# UKW Marine Radio Chatter - Bridge 2 Bridge Communication
This notebook uses pretrained models to transcribe the audio files from the UKW Marine Radio Chatter - Bridge 2 Bridge Communication dataset. <br>
The dataset contains audio files and their corresponding transcriptions. Further we classify the speakers contained in the audio files.

In [1]:
import os
import IPython
import torchaudio
import torch
import wandb
from pydub import AudioSegment
from pytorch_lightning.utilities.types import STEP_OUTPUT
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from src.utils import txt_to_dataframe
import numpy as np

## Configuration - Data Directories

In [2]:
class Config:
    DATA_DIR = '../data/'
    AUDIO_DIR = DATA_DIR + 'audio/'
    TEXT_DIR = DATA_DIR + 'text/'
    DATASET_DIR = 'dataset/'
    
    KAGGLE_DATA_TAG = 'linogova/marine-radio-chatter-bridge-2-bridge-communication/1'
    KAGGLE_DATA_DIR = 'Marine_audio/'

config = Config()

In [3]:
import pandas as pd
import torchaudio
import torch

def process_audio_segments(meta_df, waveform, sample_rate, padding=0.1):
    # Prepare lists to hold the processed data
    new_rows = []
    audio_segments = []
    
    # Initialize variables to track the current segment
    current_start = None
    current_end = None
    current_transcript = ""
    current_audio = []
    current_length = 0  # Length in seconds
    
    for index, row in meta_df.iterrows():
        transcript = row['transcript']
        start_time = row['start_time'] - padding
        end_time = row['end_time'] + padding
        
        # If the padding overlaps with the previous segment, adjust the start time
        if current_end is not None and row['start_time']- current_end < padding:
             start_time = current_end
            
        # If the padding overlaps with the next segment, adjust the end time
        if index < len(meta_df) - 1 and meta_df.iloc[index + 1]['start_time'] - end_time < padding:
            end_time = ((meta_df.iloc[index + 1]['start_time'] - end_time) / 2) + end_time
            
        # Calculate the duration of the current row's audio segment
        duration = end_time - start_time
        
        # If adding this segment exceeds 30 seconds, save the current segment and start a new one
        if current_length + duration > 30:
            if current_start is not None:
                new_rows.append({
                    'start_time': current_start,
                    'end_time': current_end,
                    'transcript': current_transcript
                })
                audio_segments.append(torch.cat(current_audio, dim=1).squeeze())
            
            # Reset for the new segment
            current_start = start_time
            current_end = end_time
            current_transcript = transcript
            current_audio = [waveform[:, int(start_time * sample_rate):int(end_time * sample_rate)]]
            current_length = duration
        else:
            # If it doesn't exceed 30 seconds, update the current segment
            if current_start is None:
                current_start = start_time
            current_end = end_time
            current_transcript += " " + str(transcript)
            current_audio.append(waveform[:, int(start_time * sample_rate):int(end_time * sample_rate)])
            current_length += duration
    
    # Add the last segment if any
    if current_start is not None:
        new_rows.append({
            'start_time': current_start,
            'end_time': current_end,
            'transcript': current_transcript
        })
        audio_segments.append(torch.cat(current_audio, dim=1).squeeze())
    
    return new_rows, audio_segments

# data_ids = [f.replace(".wav", "") for f in os.listdir(config.DATASET_DIR + "audio")]

# for idx in range(len(data_ids)):
#     if idx % 10 == 0:
#         print(f"Processing {idx}/{len(data_ids)}")
#     audio_fpath = os.path.join(config.DATASET_DIR, f"audio/{data_ids[idx]}.wav")
#     text_fpath = os.path.join(config.DATASET_DIR, f"text/{data_ids[idx]}.csv")
#     
#     waveform, sample_rate = torchaudio.load(audio_fpath)
#     waveform = waveform.float()
#     transcripts_df = pd.read_csv(text_fpath)
#     
#     target_segments, audio_segments = process_audio_segments(transcripts_df, waveform, sample_rate)

In [4]:
from scipy import signal
import os
import time
import pandas as pd
import torch
import torchaudio
from joblib import Parallel, delayed
from torch.utils.data import Dataset
from src.utils import bcolors

c = bcolors()


def lowpass_filter(audio_data, sr):
    # Create a lowpass filter
    b, a = signal.butter(4, 1300, 'low', fs=sr)
    # Apply the lowpass filter
    filtered_audio_data = signal.filtfilt(b, a, audio_data)
    return filtered_audio_data

def apply_rms_normalization(waveform):
    rms_value = waveform.pow(2).mean().sqrt()  # Calculate RMS value of the waveform
    target_rms = 0.1  # Example target RMS value
    normalized_waveform = waveform * (target_rms / rms_value)  # Scale waveform to desired RMS value
    return normalized_waveform

class UKWFunkSprache(Dataset):
    def __init__(self, 
                 file_ids, 
                 root_dir, 
                 proc=None,
                 padding=None,
                 rms_norm=False,
                 filter_data=False,
                 n_jobs=-1):
        self.feed_ids = file_ids
        self.root_dir = root_dir
        self.processor = proc
        self.rms_norm = rms_norm
        self.filter_data = filter_data
        self.padding = padding

        print(f"\n{c.OKGREEN}Preloading Samples...{c.ENDC}")
        print(f"\n{c.OKCYAN}Audio Files:         {len(self.feed_ids)}{c.ENDC}")
        print(f"{c.OKCYAN}Jobs:                {n_jobs} {c.ENDC}\n")

        start_time = time.time()
        result = []
        for idx in range(len(self.feed_ids)):
            result.append(self.process_file(idx))
        # result = Parallel(n_jobs=n_jobs)(
        #     delayed(self.process_file)(idx) for idx in range(len(self.feed_ids))
        # )
        result = [item for sublist in result for item in sublist]
        print(f"\n{c.OKGREEN}Preloading Complete!{c.ENDC}")

        self.audio_samples = [item['audio'] for item in result]
        self.transcriptions = [item['transcript'] for item in result]
        self.groups = [item['group'] for item in result]
        
        print(f"{c.OKCYAN}Number of Samples:   {len(self.audio_samples)} {c.ENDC}\n")

        end_time = time.time()
        t = end_time - start_time
        print(f"\n{c.OKBLUE}Time taken:      {int((t - (t % 60)) / 60)} min {t % 60} sec {c.ENDC}")

    def process_file(self, idx):
        feed_id = self.feed_ids[idx]
        audio_fpath = os.path.join(self.root_dir, f"audio/{feed_id}.wav")
        text_fpath = os.path.join(self.root_dir, f"text/{feed_id}.csv")

        waveform, sample_rate = torchaudio.load(audio_fpath, channels_first=True)
        waveform = waveform.float()
        transcripts_df = pd.read_csv(text_fpath)
        
        if self.rms_norm:
            waveform = apply_rms_normalization(waveform)
        
        if self.filter_data:
            waveform = lowpass_filter(waveform, sample_rate)
            
        audio_dur = transcripts_df.iloc[-1]['end_time'] - transcripts_df.iloc[0]['start_time']
        if audio_dur < 10:
            return []
        
        # batches = batch_data(transcripts_df.to_dict("records"), waveform)
        target_segments, audio_segments = process_audio_segments(transcripts_df, waveform, sample_rate, padding=self.padding)
        if len(audio_segments) == 0:
            return []
        
        samples = []
        
        for i in range(len(target_segments)):
            if self.processor:
                target = self.processor.tokenizer(target_segments[i]['transcript'], return_tensors="pt").input_ids.squeeze(0)
                audio = self.processor.feature_extractor(audio_segments[i], sampling_rate=sample_rate, return_tensors="pt").input_features.squeeze(0)
                samples.append({
                    'group': str(feed_id),
                    'audio': audio,
                    'transcript': target
                })
            else:
                samples.append({
                    'group': str(feed_id),
                    'audio': audio_segments[i],
                    'transcript': target_segments[i]['transcript']
                })
        return samples

    def __len__(self):
        return len(self.audio_samples)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        audio = self.audio_samples[idx]
        transcript = self.transcriptions[idx]

        return {
            "input_features": audio,
            "labels": transcript
        }


# Model

In [5]:
import torchmetrics
from transformers import get_linear_schedule_with_warmup
import pytorch_lightning as pl
import evaluate


class WhisperLightningModule(pl.LightningModule):
    def __init__(self, model_name: str, processor, learning_rate: float, weight_decay: float, warmup_steps: int, num_jobs: int = 8):
        super().__init__()
        self.save_hyperparameters()

        self.processor = processor
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.warmup_steps = warmup_steps
        self.num_jobs = num_jobs

        self.model = WhisperForConditionalGeneration.from_pretrained(model_name)
        self.wer = torchmetrics.text.wer.WordErrorRate()
        self.best_val_loss = float("inf")
        self.val_loss = []
        self.val_preds = []
        self.val_true = []

    def forward(self, input_features, labels):
        return self.model(input_features=input_features, labels=labels)

    def training_step(self, batch, batch_idx):
        outputs = self(batch["input_features"], batch["labels"])
        loss = outputs.loss
        self.log("train_loss", loss, prog_bar=True, on_step=True, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        outputs = self(batch["input_features"], batch["labels"])
        loss = outputs.loss
        self.val_loss.append(loss)
        self.log("val_loss", loss, prog_bar=True, on_step=True, on_epoch=True)

    #     self.val_preds.append(outputs.logits.argmax(-1))
    #     self.val_true.append(batch["labels"])

    # def on_validation_epoch_end(self) -> None:
    #     preds_decoded = [self.processor.decode(pred, skip_special_tokens=True) for b in self.val_preds for pred in b]
    #     true_decoded = [self.processor.decode(true, skip_special_tokens=True) for b in self.val_true for true in b]
    #     # self.val_preds = Parallel(n_jobs=self.num_jobs)(
    #     #     delayed(self.processor.decode)(pred, skip_special_tokens=True) for b in self.val_preds for pred in b
    #     # )
    #     # self.val_true = Parallel(n_jobs=self.num_jobs)(
    #     #     delayed(self.processor.decode)(true, skip_special_tokens=True) for b in self.val_true for true in b
    #     # )
    #
    #     wer = self.wer(preds_decoded, true_decoded)
    #     self.log("val_wer", wer, prog_bar=True)
    #
    #     self.val_preds = []
    #     self.val_true = []

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.warmup_steps,
            num_training_steps=self.trainer.estimated_stepping_batches
        )
        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]

In [6]:
import pytorch_lightning as pl
from torch.utils.data import DataLoader


class SpeechDataModule(pl.LightningDataModule):
    def __init__(self, train_dataset, val_dataset, processor, batch_size: int, num_workers: int = 8):
        super().__init__()
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.processor = processor
        self.batch_size = batch_size
        self.num_workers = num_workers

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, collate_fn=self.collate_fn, num_workers=self.num_workers)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, collate_fn=self.collate_fn, num_workers=self.num_workers)
    
    def test_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, collate_fn=self.collate_fn, num_workers=self.num_workers)

    def collate_fn(self, features):
        input_features = [feature["input_features"] for feature in features]
        batch = self.processor.feature_extractor.pad(
            [{"input_features": input_feature} for input_feature in input_features],
            return_tensors="pt"
        )

        labels = [feature["labels"] for feature in features]
        labels_batch = self.processor.tokenizer.pad(
            [{"input_ids": label} for label in labels],
            return_tensors="pt"
        )
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        if (labels[:, 0] == self.processor.tokenizer.pad_token_id).all():
            labels = labels[:, 1:]
        batch["labels"] = labels
        return batch


# Hyperparameter Finetuning

In [7]:
from transformers import WhisperProcessor

model_config = {
    "model_name": "openai/whisper-tiny",
}

# Initialize the processor
processor = WhisperProcessor.from_pretrained(
    model_config["model_name"], 
    language='en', 
    task="transcribe", 
    do_normalize=True, 
    sampling_rate=16000, 
    return_tensors="pt", 
    device="cpu",
    local_files_only=True
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
model_config["filter_data"] = False
model_config["rms_norm"] = False

# Create the Datasets
feed_ids = [f.replace(".wav", "") for f in os.listdir(config.DATASET_DIR + "audio")]

ds_train = UKWFunkSprache(
    feed_ids[:1700], 
    config.DATASET_DIR, 
    proc=processor, 
    filter_data=model_config["filter_data"], 
    rms_norm=model_config["rms_norm"],
    padding=0.5
)
ds_val = UKWFunkSprache(
    feed_ids[1700:], 
    config.DATASET_DIR, 
    proc=processor, 
    filter_data=model_config["filter_data"], 
    rms_norm=model_config["rms_norm"],
    padding=0.5
)

model_config["num_train_samples"] = len(ds_train)
model_config["num_val_samples"] = len(ds_val)


[92mPreloading Samples...[0m

[96mAudio Files:         1700[0m
[96mJobs:                -1 [0m


[92mPreloading Complete![0m
[96mNumber of Samples:   2938 [0m


[94mTime taken:      0 min 46.50473380088806 sec [0m

[92mPreloading Samples...[0m

[96mAudio Files:         300[0m
[96mJobs:                -1 [0m


[92mPreloading Complete![0m
[96mNumber of Samples:   511 [0m


[94mTime taken:      0 min 7.73941707611084 sec [0m


In [9]:
import gc
from transformers import WhisperForConditionalGeneration
import optuna
from pytorch_lightning.loggers import WandbLogger
import wandb


def train_model():
    wandb.init()
    
    # Initialize the WandbLogger
    wandb_logger = WandbLogger(
        project="ukw-radio-trans_" + model_config["model_name"].split("/")[-1], 
        name=f"lr_{wandb.config.lr:.6f}_wd_{wandb.config.weight_decay:.6f}",
        log_model=False
    )
    wandb.require(experiment="service") 
    
    data_module = SpeechDataModule(ds_train, ds_val, processor, wandb.config.batch_size)
    
    # Initialize the model with suggested hyperparameters
    model_train = WhisperLightningModule(model_config["model_name"], processor, wandb.config.lr, wandb.config.weight_decay, wandb.config.warmup_steps)
    model_train.model = WhisperForConditionalGeneration.from_pretrained(model_config["model_name"])
    model_train.model.generation_config.language = "en"
    model_train.model.generation_config.task = "transcribe"
    model_train.model.generation_config.is_multilingual = False
    model_train.model.generation_config.temperature = (0, 0.2, 0.4, 0.6, 0.8, 1.0)
    model_train.model.generation_config.compression_ratio_threshold = wandb.config.cr_threshold
    
    # Freeze or unfreeze layers based on the original configuration
    for param in model_train.model.parameters():
        param.requires_grad = False
        
    # Freeze layers in the decoder
    for param in model_train.model.model.decoder.parameters():
        param.requires_grad = wandb.config.unfreeze_decoder
        
    # Freeze layers in the encoder
    for param in model_train.model.model.encoder.parameters():
        param.requires_grad = wandb.config.unfreeze_encoder
        
    # Freeze layers in the linear layer
    model_train.model.proj_out.weight.requires_grad = wandb.config.unfreeze_linear
    
    # Initialize Early Stopping monitor the difference between the training and validation loss
    early_stopping = pl.callbacks.EarlyStopping("val_loss_epoch", patience=1, mode="min", min_delta=0.05, verbose=False)
    
    # Initialize the Trainer with WandbLogger
    trainer = pl.Trainer(
        max_epochs=wandb.config.n_epochs,
        logger=wandb_logger,
        accelerator="auto",
        log_every_n_steps=5,
        num_sanity_val_steps=5,
        callbacks=[early_stopping],
        enable_model_summary=False,
        enable_checkpointing=False
    )
    
    # Train the model
    trainer.fit(model_train, data_module)
    val_loss = trainer.callback_metrics["val_loss_epoch"].item()
    
    # Free up memory
    del model_train
    del data_module
    del trainer
    torch.cuda.empty_cache()
    gc.collect()
    
    # Return the validation loss
    return val_loss


In [11]:
sweep_config = {
    'method': 'bayes',
    'name': 'version-2',
    'metric': {
        'goal': 'minimize',
        'name': 'val_loss_epoch'
    },
    'parameters': {
        'lr': {'max': 0.0001, 'min': 0.00005},
        'weight_decay': {'max': 0.005, 'min': 0.0001},
        'batch_size': {'values': [8]},
        'warmup_steps': {'values': [400, 800]},
        'n_epochs': {'values': [3]},
        'unfreeze_encoder': {'values': [False]},
        'unfreeze_decoder': {'values': [True]},
        'unfreeze_linear': {'values': [True]},
        'cr_threshold': {'values': [1.2, 1.35, 1.5]}
    }
}

sweep_id=wandb.sweep(sweep_config, project="hp_tuning_" + model_config["model_name"].split("/")[-1])
wandb.agent(sweep_id=sweep_id, function=train_model, count=20)

Create sweep with ID: ams4q60w
Sweep URL: https://wandb.ai/tobias-ettling-wandb/hp_tuning_whisper-tiny/sweeps/ams4q60w


[34m[1mwandb[0m: Agent Starting Run: l5wczxkt with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	cr_threshold: 1.5
[34m[1mwandb[0m: 	lr: 8.429574733397666e-05
[34m[1mwandb[0m: 	n_epochs: 3
[34m[1mwandb[0m: 	unfreeze_decoder: True
[34m[1mwandb[0m: 	unfreeze_encoder: False
[34m[1mwandb[0m: 	unfreeze_linear: True
[34m[1mwandb[0m: 	warmup_steps: 800
[34m[1mwandb[0m: 	weight_decay: 0.003907846891057378
[34m[1mwandb[0m: Currently logged in as: [33mtobias-ettling[0m ([33mtobias-ettling-wandb[0m). Use [1m`wandb login --relogin`[0m to force relogin


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/tobias/Desktop/Uni/SS24/NLP/UKW_SpeachToText/.venv/lib/python3.12/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁████████████████████
train_loss_epoch,█▁
train_loss_step,█▆▆▅▇▄▃▃▃▄▃▃▂▃▃▂▃▄▂▁▁▁▁▂▁▄▂▂▂▁▂▂▂▂▂▂▁▂▂▂
trainer/global_step,▁▁▂▂▂▃▃▃▄▄▄▁▁▁▁▁▁▁▁▂▅▅▅▆▆▆▇▇▇██▂▂▂▂▂▂▂▂▂
val_loss_epoch,█▁
val_loss_step,▂▃▄▁▄▃▄█▇▆▃▅▂▃▂▃▅▂▃▂▁▃▄▆▃▄▂█▁▆▄▁▁▄▂▃▃▂▂▃

0,1
epoch,1.0
train_loss_epoch,1.00144
train_loss_step,0.68545
trainer/global_step,735.0
val_loss_epoch,1.13981
val_loss_step,1.09186


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: osuo2itu with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	cr_threshold: 1.2
[34m[1mwandb[0m: 	lr: 6.331437008602591e-05
[34m[1mwandb[0m: 	n_epochs: 3
[34m[1mwandb[0m: 	unfreeze_decoder: True
[34m[1mwandb[0m: 	unfreeze_encoder: False
[34m[1mwandb[0m: 	unfreeze_linear: True
[34m[1mwandb[0m: 	warmup_steps: 800
[34m[1mwandb[0m: 	weight_decay: 0.0034848868475450745


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/tobias/Desktop/Uni/SS24/NLP/UKW_SpeachToText/.venv/lib/python3.12/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▅▅▅▅▅▅▅▅▅▅▅▅▅▅█████████████
train_loss_epoch,█▃▁
train_loss_step,█▆▅▅▄▄▅▃▃▄▃▃▃▂▃▃▂▂▃▂▃▅▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▂▁
trainer/global_step,▁▁▂▂▂▃▃▁▁▁▁▁▁▃▄▄▄▅▅▅▆▁▂▂▂▂▂▆▆▇▇▇██▂▂▂▂▂▂
val_loss_epoch,█▅▁
val_loss_step,▃▃▂▃▄█▇▆▂▄▃▄▄▅▃▂▄▄█▆▄▂▄▃▄▅▄▃▂▃▄█▁▃▁▄▄▃▄▃

0,1
epoch,2.0
train_loss_epoch,0.76394
train_loss_step,0.50226
trainer/global_step,1103.0
val_loss_epoch,1.06426
val_loss_step,1.19939


[34m[1mwandb[0m: Agent Starting Run: 8zu8k5z8 with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	cr_threshold: 1.5
[34m[1mwandb[0m: 	lr: 8.127392276847558e-05
[34m[1mwandb[0m: 	n_epochs: 3
[34m[1mwandb[0m: 	unfreeze_decoder: True
[34m[1mwandb[0m: 	unfreeze_encoder: False
[34m[1mwandb[0m: 	unfreeze_linear: True
[34m[1mwandb[0m: 	warmup_steps: 800
[34m[1mwandb[0m: 	weight_decay: 0.0013632377464585649


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/tobias/Desktop/Uni/SS24/NLP/UKW_SpeachToText/.venv/lib/python3.12/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁████████████████████
train_loss_epoch,█▁
train_loss_step,█▆▆▅▇▄▃▃▃▄▃▃▂▃▃▂▃▄▂▁▁▁▁▂▁▄▂▂▂▁▂▂▂▂▂▂▁▂▂▂
trainer/global_step,▁▁▂▂▂▃▃▃▄▄▄▁▁▁▁▁▁▁▁▂▅▅▅▆▆▆▇▇▇██▂▂▂▂▂▂▂▂▂
val_loss_epoch,█▁
val_loss_step,▂▃▄▁▄▃▄█▇▆▃▅▂▃▂▃▅▂▃▂▁▃▄▆▃▄▂█▁▆▄▁▁▄▂▃▃▂▂▃

0,1
epoch,1.0
train_loss_epoch,1.0044
train_loss_step,0.69763
trainer/global_step,735.0
val_loss_epoch,1.14376
val_loss_step,1.10966


[34m[1mwandb[0m: Agent Starting Run: u1i9xjv5 with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	cr_threshold: 1.2
[34m[1mwandb[0m: 	lr: 5.905986501916375e-05
[34m[1mwandb[0m: 	n_epochs: 3
[34m[1mwandb[0m: 	unfreeze_decoder: True
[34m[1mwandb[0m: 	unfreeze_encoder: False
[34m[1mwandb[0m: 	unfreeze_linear: True
[34m[1mwandb[0m: 	warmup_steps: 800
[34m[1mwandb[0m: 	weight_decay: 0.0031373657992394195


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/tobias/Desktop/Uni/SS24/NLP/UKW_SpeachToText/.venv/lib/python3.12/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▅▅▅▅▅▅▅▅▅▅▅▅▅▅█████████████
train_loss_epoch,█▃▁
train_loss_step,█▆▅▅▄▄▅▃▃▄▃▃▃▂▃▃▂▂▃▂▃▅▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▂▁
trainer/global_step,▁▁▂▂▂▃▃▁▁▁▁▁▁▃▄▄▄▅▅▅▆▁▂▂▂▂▂▆▆▇▇▇██▂▂▂▂▂▂
val_loss_epoch,█▄▁
val_loss_step,▃▃▂▃▄█▇▆▂▄▄▄▄▅▃▂▄▄█▆▄▂▄▃▄▅▄▃▂▃▄█▁▃▁▄▄▃▄▃

0,1
epoch,2.0
train_loss_epoch,0.77517
train_loss_step,0.51539
trainer/global_step,1103.0
val_loss_epoch,1.06287
val_loss_step,1.2412


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: mi60mzom with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	cr_threshold: 1.2
[34m[1mwandb[0m: 	lr: 5.245358269258695e-05
[34m[1mwandb[0m: 	n_epochs: 3
[34m[1mwandb[0m: 	unfreeze_decoder: True
[34m[1mwandb[0m: 	unfreeze_encoder: False
[34m[1mwandb[0m: 	unfreeze_linear: True
[34m[1mwandb[0m: 	warmup_steps: 800
[34m[1mwandb[0m: 	weight_decay: 0.0035430987113642017


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/tobias/Desktop/Uni/SS24/NLP/UKW_SpeachToText/.venv/lib/python3.12/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▅▅▅▅▅▅▅▅▅▅▅▅▅▅█████████████
train_loss_epoch,█▃▁
train_loss_step,█▆▅▅▄▄▅▃▃▄▃▃▃▂▃▃▂▂▃▂▃▅▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▂▁
trainer/global_step,▁▁▂▂▂▃▃▁▁▁▁▁▁▃▄▄▄▅▅▅▆▁▂▂▂▂▂▆▆▇▇▇██▂▂▂▂▂▂
val_loss_epoch,█▄▁
val_loss_step,▃▄▂▃▅█▇▆▂▄▄▄▄▅▃▂▄▄█▆▄▂▄▃▄▅▄▃▁▃▄█▁▃▁▄▄▃▄▃

0,1
epoch,2.0
train_loss_epoch,0.79421
train_loss_step,0.53765
trainer/global_step,1103.0
val_loss_epoch,1.06219
val_loss_step,1.26057


[34m[1mwandb[0m: Agent Starting Run: k6p4ggq0 with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	cr_threshold: 1.2
[34m[1mwandb[0m: 	lr: 5.9289937106594744e-05
[34m[1mwandb[0m: 	n_epochs: 3
[34m[1mwandb[0m: 	unfreeze_decoder: True
[34m[1mwandb[0m: 	unfreeze_encoder: False
[34m[1mwandb[0m: 	unfreeze_linear: True
[34m[1mwandb[0m: 	warmup_steps: 800
[34m[1mwandb[0m: 	weight_decay: 0.0025586121805272655


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/tobias/Desktop/Uni/SS24/NLP/UKW_SpeachToText/.venv/lib/python3.12/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


VBox(children=(Label(value='0.008 MB of 0.008 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▅▅▅▅▅▅▅▅▅▅▅▅▅▅█████████████
train_loss_epoch,█▃▁
train_loss_step,█▆▅▅▄▄▅▃▃▄▃▃▃▂▃▃▂▂▃▂▃▅▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▂▁
trainer/global_step,▁▁▂▂▂▃▃▁▁▁▁▁▁▃▄▄▄▅▅▅▆▁▂▂▂▂▂▆▆▇▇▇██▂▂▂▂▂▂
val_loss_epoch,█▄▁
val_loss_step,▃▃▂▃▄█▇▆▂▄▄▄▄▅▃▂▄▄█▆▄▂▄▃▄▅▄▃▂▃▄█▁▃▁▄▄▃▄▃

0,1
epoch,2.0
train_loss_epoch,0.77454
train_loss_step,0.51462
trainer/global_step,1103.0
val_loss_epoch,1.06302
val_loss_step,1.24097


[34m[1mwandb[0m: Agent Starting Run: r3vv6mx7 with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	cr_threshold: 1.2
[34m[1mwandb[0m: 	lr: 6.798285311945353e-05
[34m[1mwandb[0m: 	n_epochs: 3
[34m[1mwandb[0m: 	unfreeze_decoder: True
[34m[1mwandb[0m: 	unfreeze_encoder: False
[34m[1mwandb[0m: 	unfreeze_linear: True
[34m[1mwandb[0m: 	warmup_steps: 800
[34m[1mwandb[0m: 	weight_decay: 0.0029279606708938325


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/tobias/Desktop/Uni/SS24/NLP/UKW_SpeachToText/.venv/lib/python3.12/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

VBox(children=(Label(value='0.002 MB of 0.008 MB uploaded\r'), FloatProgress(value=0.3069948186528497, max=1.0…

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁████████████████████
train_loss_epoch,█▁
train_loss_step,█▆▆▅▇▅▃▃▃▄▃▃▂▃▄▂▄▄▂▁▁▁▁▂▁▄▂▂▂▁▂▂▂▂▂▁▂▂▂▂
trainer/global_step,▁▁▂▂▂▃▃▃▄▄▄▁▁▁▁▁▁▁▁▂▅▅▅▆▆▆▇▇▇██▂▂▂▂▂▂▂▂▂
val_loss_epoch,█▁
val_loss_step,▂▃▄▁▄▃▄█▇▆▃▅▂▃▂▃▅▂▃▂▁▃▄▆▃▄▂█▁▆▃▁▁▄▂▃▃▂▂▃

0,1
epoch,1.0
train_loss_epoch,1.01842
train_loss_step,0.69373
trainer/global_step,735.0
val_loss_epoch,1.13026
val_loss_step,1.10101


[34m[1mwandb[0m: Agent Starting Run: 08fyp9yp with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	cr_threshold: 1.35
[34m[1mwandb[0m: 	lr: 6.343620123508794e-05
[34m[1mwandb[0m: 	n_epochs: 3
[34m[1mwandb[0m: 	unfreeze_decoder: True
[34m[1mwandb[0m: 	unfreeze_encoder: False
[34m[1mwandb[0m: 	unfreeze_linear: True
[34m[1mwandb[0m: 	warmup_steps: 400
[34m[1mwandb[0m: 	weight_decay: 0.00414808776241688


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/tobias/Desktop/Uni/SS24/NLP/UKW_SpeachToText/.venv/lib/python3.12/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▅▅▅▅▅▅▅▅▅▅▅▅▅▅█████████████
train_loss_epoch,█▃▁
train_loss_step,█▅▅▄▃▄▄▃▃▄▃▃▃▂▃▃▂▂▃▂▃▄▂▂▂▂▂▂▁▁▁▂▂▂▂▁▁▁▂▁
trainer/global_step,▁▁▂▂▂▃▃▁▁▁▁▁▁▃▄▄▄▅▅▅▆▁▂▂▂▂▂▆▆▇▇▇██▂▂▂▂▂▂
val_loss_epoch,█▂▁
val_loss_step,▂▃▂▃▄█▇▆▂▄▃▄▃▅▃▂▄▄█▆▃▁▄▂▄▄▃▃▁▃▄█▁▃▁▄▄▃▄▃

0,1
epoch,2.0
train_loss_epoch,0.64387
train_loss_step,0.47519
trainer/global_step,1103.0
val_loss_epoch,1.06427
val_loss_step,1.03249


[34m[1mwandb[0m: Agent Starting Run: tnycbwsl with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	cr_threshold: 1.2
[34m[1mwandb[0m: 	lr: 6.346124608415621e-05
[34m[1mwandb[0m: 	n_epochs: 3
[34m[1mwandb[0m: 	unfreeze_decoder: True
[34m[1mwandb[0m: 	unfreeze_encoder: False
[34m[1mwandb[0m: 	unfreeze_linear: True
[34m[1mwandb[0m: 	warmup_steps: 800
[34m[1mwandb[0m: 	weight_decay: 0.004288391302419717


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/tobias/Desktop/Uni/SS24/NLP/UKW_SpeachToText/.venv/lib/python3.12/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▅▅▅▅▅▅▅▅▅▅▅▅▅▅█████████████
train_loss_epoch,█▃▁
train_loss_step,█▆▅▅▄▄▅▃▃▄▃▃▃▂▃▃▂▂▃▂▃▅▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▂▁
trainer/global_step,▁▁▂▂▂▃▃▁▁▁▁▁▁▃▄▄▄▅▅▅▆▁▂▂▂▂▂▆▆▇▇▇██▂▂▂▂▂▂
val_loss_epoch,█▅▁
val_loss_step,▃▃▂▃▄█▇▆▂▄▃▄▄▅▃▂▄▄█▆▄▂▄▃▄▅▄▃▂▃▄█▁▃▁▄▄▃▄▃

0,1
epoch,2.0
train_loss_epoch,0.7635
train_loss_step,0.50234
trainer/global_step,1103.0
val_loss_epoch,1.06434
val_loss_step,1.2008


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


In [None]:
import gc
from transformers import WhisperForConditionalGeneration
import optuna
from pytorch_lightning.loggers import WandbLogger
import wandb

torch.manual_seed(42)

def objective(trial):
    # Define the search space for hyperparameters
    learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-3)
    weight_decay = trial.suggest_float("weight_decay", 1e-4, 1e-3)
    
    parameters = {
        "n_epochs": 3,
        "batch_size": 8,
        "learning_rate": learning_rate,
        "warmup_steps": 200,
        "weight_decay": weight_decay,
        "unfreeze_encoder": False,
        "unfreeze_decoder": True,
        "unfreeze_linear": False
    }
    
    # Initialize DataModule with the suggested batch size
    data_module = SpeechDataModule(ds_train, ds_val, processor, parameters["batch_size"])
    
    # Initialize the model with suggested hyperparameters
    model_train = WhisperLightningModule(model_config["model_name"], processor, parameters["learning_rate"], parameters["weight_decay"], parameters["warmup_steps"])
    model_train.model = WhisperForConditionalGeneration.from_pretrained(model_config["model_name"])
    model_train.model.generation_config.language = "en"
    model_train.model.generation_config.task = "transcribe"
    model_train.model.generation_config.is_multilingual = False
    
    # Freeze or unfreeze layers based on the original configuration
    for param in model_train.model.parameters():
        param.requires_grad = False
        
    # Freeze layers in the decoder
    for param in model_train.model.model.decoder.parameters():
        param.requires_grad = parameters["unfreeze_decoder"]
        
    # Freeze layers in the encoder
    for param in model_train.model.model.encoder.parameters():
        param.requires_grad = parameters["unfreeze_encoder"]
        
    # Freeze layers in the linear layer
    model_train.model.proj_out.weight.requires_grad = parameters["unfreeze_linear"]
    
    
    # Initialize the WandbLogger
    wandb_logger = WandbLogger(
        project="ukw-radio-trans_" + model_config["model_name"].split("/")[-1], 
        name=f"lr_{parameters["learning_rate"]:.6f}_wd_{parameters["weight_decay"]:.6f}",
        log_model=False)
    wandb_logger.log_hyperparams(parameters)
    
    # Initialize Early Stopping monitor the difference between the training and validation loss
    early_stopping = pl.callbacks.EarlyStopping("val_loss", patience=1, mode="min", min_delta=0.1, verbose=False)
    
    # Initialize the Trainer with WandbLogger
    trainer = pl.Trainer(
        max_epochs=parameters["n_epochs"],
        logger=wandb_logger,
        accelerator="auto",
        log_every_n_steps=5,
        num_sanity_val_steps=5,
        callbacks=[early_stopping]
    )
    
    # Train the model
    trainer.fit(model_train, data_module)
    val_loss = trainer.callback_metrics["val_loss"].item()
    
    # Finish the WandbLogger run
    wandb.finish()
    
    # Free up memory
    del model_train
    del data_module
    del trainer
    torch.cuda.empty_cache()
    gc.collect()
    
    # Return the validation loss
    return val_loss

# Create a study object and optimize the objective function
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)

# Print the best hyperparameters found
print("Best trial:")
best_trial = study.best_trial
print(f"  Value: {best_trial.value}")
print("  Params: ")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")
