In [9]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))


True
NVIDIA L4


In [10]:
# Path to your extracted dataset
extract_dir = "/home/aodhan_h6/tess_data"



In [11]:
import os
import glob

print("Folders inside tess_data:")
print(os.listdir(extract_dir))

# Check .wav files
wav_files = glob.glob(f"{extract_dir}/**/*.wav", recursive=True)
print(f"Found {len(wav_files)} wav files")


Folders inside tess_data:
['TESS Toronto emotional speech set data', 'tess toronto emotional speech set data']
Found 5600 wav files


In [12]:
import os
import torch
import torchaudio
import pandas as pd
import numpy as np
import glob
import re
import random
from datasets import Dataset, Audio
from transformers import WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import evaluate

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
print(torch.cuda.get_device_name(0))
# Load Whisper Model and Processor
model_name = "openai/whisper-tiny"

processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)

model.to(device)



Using device: cuda
NVIDIA L4


WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 384, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(384, 384, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 384)
      (layers): ModuleList(
        (0-3): 4 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=384, out_features=384, bias=False)
            (v_proj): Linear(in_features=384, out_features=384, bias=True)
            (q_proj): Linear(in_features=384, out_features=384, bias=True)
            (out_proj): Linear(in_features=384, out_features=384, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=384, out_features=1536, bias=True)
          (fc2): Linear(in_features=1536, out_features=384, bias=True)
          

In [13]:
extract_dir = "/home/aodhan_h6/tess_data"

print("Folders inside tess_data:")
print(os.listdir(extract_dir))

wav_files = glob.glob(f"{extract_dir}/**/*.wav", recursive=True)
print(f"Found {len(wav_files)} wav files")

sample_size = int(len(wav_files) * 0.25)
random.seed(42)
sampled_wav_files = random.sample(wav_files, sample_size)

audio_files, transcriptions, emotions = [], [], []

for file_path in sampled_wav_files:
    file_name = os.path.basename(file_path)
    word = re.search(r'[OY]AF_([a-z]+)_', file_name)
    word = word.group(1) if word else "unknown"
    emotion = re.search(r'_(angry|disgust|fear|happy|neutral|ps|sad)\.wav', file_name)
    emotion = emotion.group(1) if emotion else "unknown"
    audio_files.append(file_path)
    transcriptions.append(word)
    emotions.append(emotion)

df = pd.DataFrame({'audio': audio_files, 'text': transcriptions, 'emotion': emotions})
print(df.head())


Folders inside tess_data:
['TESS Toronto emotional speech set data', 'tess toronto emotional speech set data']
Found 5600 wav files
                                               audio  text  emotion
0  /home/aodhan_h6/tess_data/tess toronto emotion...   yes      sad
1  /home/aodhan_h6/tess_data/TESS Toronto emotion...  neat  neutral
2  /home/aodhan_h6/tess_data/TESS Toronto emotion...   lid      sad
3  /home/aodhan_h6/tess_data/TESS Toronto emotion...  lean  disgust
4  /home/aodhan_h6/tess_data/TESS Toronto emotion...  thin    angry


In [14]:
# Remove torchaudio.load() because audio is already loaded
def prepare_audio(batch):
    return batch  # Do nothing — audio already in correct format

dataset = Dataset.from_pandas(df)
dataset = dataset.cast_column("audio", Audio())

# No need to load audio again
dataset = dataset.map(prepare_audio, num_proc=2)

split = dataset.train_test_split(test_size=0.2)
train_dataset = split["train"]
test_dataset = split["test"]


Map (num_proc=2): 100%|██████████| 1400/1400 [00:00<00:00, 8773.41 examples/s]


In [18]:
def prepare_dataset_for_whisper(batch):
    audio = batch["audio"]

    # Resample if needed
    if audio["sampling_rate"] != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=audio["sampling_rate"], new_freq=16000)
        audio_array = resampler(torch.tensor(audio["array"]).float()).numpy()
    else:
        audio_array = audio["array"]

    # Extract log-Mel spectrogram features
    batch["input_features"] = processor.feature_extractor(
        audio_array,
        sampling_rate=16000
    ).input_features[0]

    # Encode text labels
    batch["labels"] = processor.tokenizer(batch["text"]).input_ids

    return batch


In [19]:
train_dataset = train_dataset.map(prepare_dataset_for_whisper, remove_columns=train_dataset.column_names)
test_dataset = test_dataset.map(prepare_dataset_for_whisper, remove_columns=test_dataset.column_names)


Map: 100%|███████████████████████| 1120/1120 [1:04:08<00:00,  3.44s/ examples]
Map: 100%|███████████████████████████| 280/280 [15:59<00:00,  3.43s/ examples]


In [21]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from dataclasses import dataclass
from typing import Any, Dict, List, Union

# Data collator to pad inputs
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        labels = [{"input_ids": feature["labels"]} for feature in features]

        batch = processor.feature_extractor.pad(input_features, return_tensors="pt")
        labels_batch = processor.tokenizer.pad(labels, return_tensors="pt")

        batch["labels"] = labels_batch["input_ids"]
        return batch

data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

# WER metric
import evaluate
wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    pred_str = [text.lower().strip() for text in pred_str]
    label_str = [text.lower().strip() for text in label_str]

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

# Training configuration
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-tess-finetuned",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    learning_rate=5e-5,
    max_steps=300,  # Keep small for faster runs
    warmup_steps=10,
    fp16=True if device == "cuda" else False,
    evaluation_strategy="steps",
    eval_steps=20,
    save_steps=50,
    logging_steps=10,
    predict_with_generate=True,
    generation_max_length=225,
    save_total_limit=1,
    do_train=True,
    do_eval=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.tokenizer,
)

print("Starting training...")
trainer.train()

# Save final model
model.save_pretrained("./whisper-tess-finetuned-final")
processor.save_pretrained("./whisper-tess-finetuned-final")
print("Model saved to ./whisper-tess-finetuned-final")


  trainer = Seq2SeqTrainer(
max_steps is given, it will override any value given in num_train_epochs


Starting training...


Step,Training Loss,Validation Loss,Wer
20,0.0563,0.162372,0.185714
40,0.0277,0.117583,0.128571
60,0.0239,0.105752,0.117857
80,0.0644,0.079827,0.082143
100,0.0603,0.062198,0.078571
120,0.0171,0.039218,0.053571
140,0.0296,0.030725,0.028571
160,0.028,0.022731,0.021429
180,0.0114,0.021653,0.028571
200,0.0037,0.015744,0.014286


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

Model saved to ./whisper-tess-finetuned-final


In [22]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainer

# Load Base Pretrained Whisper-tiny (no fine-tuning)
base_processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
base_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny").to(device)

# Create trainer for base model
trainer_base = Seq2SeqTrainer(
    model=base_model,
    args=training_args,  # Same args used for fine-tuned model
    eval_dataset=test_dataset,  # Same test set
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=base_processor.tokenizer,
)

print("Evaluating Pretrained Whisper...")
base_metrics = trainer_base.evaluate()

print("Evaluating Fine-tuned Whisper...")
finetuned_metrics = trainer.evaluate()

print("\nComparison of WER (Word Error Rate):")
print(f"Pretrained Whisper-tiny WER: {base_metrics['eval_wer']:.4f}")
print(f"Fine-tuned Whisper-tiny WER: {finetuned_metrics['eval_wer']:.4f}")


  trainer_base = Seq2SeqTrainer(
max_steps is given, it will override any value given in num_train_epochs


Evaluating Pretrained Whisper...


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

Evaluating Fine-tuned Whisper...


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr


Comparison of WER (Word Error Rate):
Pretrained Whisper-tiny WER: 4.0036
Fine-tuned Whisper-tiny WER: 0.0107


ModuleNotFoundError: No module named 'matplotlib'