In [1]:
# from datasets import load_from_disk

# common_voice = load_from_disk("data/common_voice")
# print(common_voice)

from datasets import load_dataset, Audio

# Load every split of the LibriSpeech ASR dataset (cached copy is reused when present).
dataset = load_dataset("openslr/librispeech_asr")

# Drop the metadata columns that are not needed for training and have the
# "audio" column decoded at a 16 kHz sampling rate.
dataset = dataset.remove_columns(["file", "speaker_id", "chapter_id", "id"]).cast_column(
    "audio", Audio(sampling_rate=16000)
)

# Remove the splits this notebook does not use.
for unused_split in ("train.clean.100", "train.other.500", "test.other", "validation.other"):
    del dataset[unused_split]

# Sanity check: show the first training example.
first_training_example = dataset["train.clean.360"][0]
print(first_training_example)

Using the latest cached version of the module from C:\Users\Lee\.cache\huggingface\modules\datasets_modules\datasets\openslr--librispeech_asr\2712a8f82f0d20807a56faadcd08734f9bdd24c850bb118ba21ff33ebff0432f (last modified on Thu Mar 27 15:43:55 2025) since it couldn't be found locally at openslr/librispeech_asr, or remotely on the Hugging Face Hub.


Loading dataset shards:   0%|          | 0/45 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/60 [00:00<?, ?it/s]

{'audio': {'path': 'C:\\Users\\Lee\\.cache\\huggingface\\datasets\\downloads\\extracted\\9de4d8114eadf757c2efb252d3b54ad35981e4ee7ab61c3c6dd6192c7d04ba18\\1487-133273-0000.flac', 'array': array([ 9.15527344e-05,  4.57763672e-04,  5.18798828e-04, ...,
       -4.57763672e-04, -5.49316406e-04, -4.88281250e-04], shape=(225920,)), 'sampling_rate': 16000}, 'text': 'THE SECOND IN IMPORTANCE IS AS FOLLOWS SOVEREIGNTY MAY BE DEFINED TO BE THE RIGHT OF MAKING LAWS IN FRANCE THE KING REALLY EXERCISES A PORTION OF THE SOVEREIGN POWER SINCE THE LAWS HAVE NO WEIGHT'}


In [2]:
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor

import os

# Hugging Face access token, read from the environment so that no credential is
# stored in the notebook. None means "no explicit token is passed".
# NOTE(review): a literal token was previously hardcoded in this cell; it has been
# exposed in this file and should be revoked.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Checkpoint that provides the feature extractor, tokenizer and processor.
WHISPER_CHECKPOINT = "openai/whisper-small"

# Turns raw waveforms into the model's input features.
feature_extractor = WhisperFeatureExtractor.from_pretrained(WHISPER_CHECKPOINT, token=HF_TOKEN)

# Encodes/decodes transcripts, configured for English transcription.
tokenizer = WhisperTokenizer.from_pretrained(
    WHISPER_CHECKPOINT, language="English", task="transcribe", token=HF_TOKEN
)

# Bundles a feature extractor and a tokenizer (used by the data collator and for decoding).
processor = WhisperProcessor.from_pretrained(
    WHISPER_CHECKPOINT, language="English", task="transcribe", token=HF_TOKEN
)


In [3]:
def prepare_dataset(batch):
    """Turn one dataset example into model inputs and label token ids.

    Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"`` entries) and
    ``batch["text"]``; relies on the notebook-level ``feature_extractor`` and
    ``tokenizer``.

    Args:
        batch: mapping with an ``"audio"`` entry and a ``"text"`` entry. It is
            called through ``dataset.map`` without ``batched=True``.

    Returns:
        The same mapping with two keys added: ``"input_features"`` (first entry
        of the feature extractor's ``input_features``) and ``"labels"`` (the
        tokenizer's ``input_ids`` for the transcript).
    """
    waveform = batch["audio"]

    # Compute the input features from the raw samples at their stated sampling rate.
    extracted = feature_extractor(waveform["array"], sampling_rate=waveform["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    # Encode the reference transcript into token ids.
    transcript_ids = tokenizer(batch["text"]).input_ids
    batch["labels"] = transcript_ids

    return batch

# Column names of the raw training split; they are removed once the features
# and labels have been computed.
raw_columns = dataset.column_names["train.clean.360"]

# Run the preprocessing in a single process, then have the dataset return torch tensors.
dataset = dataset.map(prepare_dataset, remove_columns=raw_columns, num_proc=1).with_format("torch")

# from whisper_data_load import prep_data


# dataset = prep_data(dataset, feature_extractor, tokenizer)

In [4]:
from transformers import WhisperForConditionalGeneration
from MagnetWhisper import MagnetWhisper

# Alternative (disabled): start from the Hub checkpoint instead of the local directory.
# model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
# (A literal access token used to be embedded in the commented-out call above; it was
# removed from this comment and should be revoked.)

# Load the starting weights from a local directory.
# NOTE(review): the feature extractor / tokenizer / processor are loaded from
# "openai/whisper-small" while this path is named "base" -- confirm they belong together.
model = WhisperForConditionalGeneration.from_pretrained("data/models/whisper/base")
 
 
# Set the language and task on the model's generation config.
model.generation_config.language = "english"
model.generation_config.task = "transcribe"

# Clear any forced decoder ids stored in the generation config.
model.generation_config.forced_decoder_ids = None

# Re-type the already-loaded instance as the project's MagnetWhisper class so that
# its methods become available on `model`.
# NOTE(review): this skips MagnetWhisper.__init__; presumably MagnetWhisper subclasses
# WhisperForConditionalGeneration -- confirm in MagnetWhisper.py.
model.__class__ = MagnetWhisper 
# NOTE(review): load_magnet is defined outside this notebook; the meaning of the
# (1, .67) pair is not visible here -- confirm against MagnetWhisper.py.
model.load_magnet([(1, .67)])

In [5]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate examples with ``"input_features"`` and ``"labels"`` into one padded batch.

    Audio features and label ids are padded separately (by the processor's
    feature extractor and tokenizer respectively). Padded label positions are
    set to -100, and a leading start token is removed when every row has one.
    """

    # Object exposing `.feature_extractor.pad(...)` and `.tokenizer.pad(...)`.
    processor: Any
    # Token id that is stripped from the front of the labels when present in every row.
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build the batch.

        Args:
            features: examples, each holding ``"input_features"`` and ``"labels"``.

        Returns:
            The padded feature batch returned by the feature extractor, with a
            ``"labels"`` tensor added.
        """
        # Audio side: hand the features to the feature extractor to get tensors back.
        audio_items = []
        for example in features:
            audio_items.append({"input_features": example["input_features"]})
        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")

        # Label side: pad the token id sequences with the tokenizer.
        label_items = []
        for example in features:
            label_items.append({"input_ids": example["labels"]})
        padded_labels = self.processor.tokenizer.pad(label_items, return_tensors="pt")

        # Positions whose attention mask is not 1 are padding; mark them with -100.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # Strip the first column only when every row begins with the start token.
        first_column_is_start = labels[:, 0] == self.decoder_start_token_id
        if first_column_is_start.all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

# Collator instance handed to the trainer; the start token id comes from the model config.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    decoder_start_token_id=model.config.decoder_start_token_id,
    processor=processor,
)

In [6]:
from evaluate import load

# Load the "wer" metric via `evaluate.load`; compute_metrics calls its .compute().
wer_metric = load("wer")

def compute_metrics(pred):
    """Compute the word error rate (in percent) for an evaluation pass.

    Relies on the notebook-level ``tokenizer`` and ``wer_metric``.

    Args:
        pred: object with ``predictions`` (predicted token ids) and
            ``label_ids`` (reference token ids, with -100 at ignored positions).

    Returns:
        ``{"wer": <100 * wer_metric result>}``.
    """
    pred_ids = pred.predictions

    # Work on a copy so the caller's label array is not overwritten in place.
    label_ids = pred.label_ids.copy()

    # -100 marks ignored positions; swap it for the pad token so decoding works.
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # Decode both sides to text, dropping special tokens.
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}


In [7]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
import os

# Weights & Biases project that receives the training logs (see report_to below).
os.environ["WANDB_PROJECT"] = "whisper-magnet"

# Run name; also the sub-directory under ./data/models/whisper/ used for outputs.
MODEL_NAME = "t2-.67-long"

training_args = Seq2SeqTrainingArguments(
    output_dir=f"./data/models/whisper/{MODEL_NAME}",  # checkpoints are written here
    per_device_train_batch_size=16,
    # gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_ratio=0.1,
    max_steps=16000,
    gradient_checkpointing=True,
    fp16=True,
    eval_strategy="steps",
    per_device_eval_batch_size=16,
    predict_with_generate=True,  # evaluate with generate() so WER can be computed on text
    generation_max_length=225,
    # save_steps=1000,
    save_total_limit=2,
    eval_steps=1000,
    logging_steps=25,
    report_to="wandb",
    # NOTE(review): neither metric_for_best_model nor load_best_model_at_end is set,
    # so this flag appears to have no effect -- confirm the intent.
    greater_is_better=False,
    weight_decay=.005,
)

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train.clean.360"],
    eval_dataset=dataset["validation.clean"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # `tokenizer=` is deprecated for this constructor; `processing_class` is its replacement.
    processing_class=processor.feature_extractor,
)


  trainer = Seq2SeqTrainer(


In [8]:
# from transformers import TrainerCallback

# class EvaluateFirstStepCallback(TrainerCallback):
#     def on_step_begin(self, args, state, control, **kwargs):
#         if state.global_step == 1:
#             control.should_evaluate = True

# trainer.add_callback(EvaluateFirstStepCallback())

# Run the training loop configured by `training_args`.
trainer.train()
# Save the model to the same directory that was used as output_dir.
trainer.save_model(f"./data/models/whisper/{MODEL_NAME}")

[34m[1mwandb[0m: Currently logged in as: [33mdavislee4891[0m ([33mdavislee4891-ohio-state-buckeyes[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss,Wer
1000,0.0471,0.091357,4.646888
2000,0.0527,0.093406,4.812323
3000,0.0476,0.092358,4.678137
4000,0.0544,0.088108,4.569685
5000,0.0529,0.085567,4.477777
6000,0.052,0.081202,4.308665
7000,0.0274,0.081301,4.084409
8000,0.0328,0.07967,4.067865
9000,0.0309,0.079629,3.970442
10000,0.0309,0.077808,3.97228


You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, 50259], [2, 50359], [3, 50363]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
`generation_config` default values have been modified to match model-specific defaults: {'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}. If this is not desired, please set these values explicitly.
The attention mask is not set and cannot be inferred from input because pad token is same as eos

In [9]:
def map_to_pred(batch):
    """Transcribe one example and store normalized reference/prediction strings.

    Relies on the notebook-level ``model``, ``processor`` and ``torch``.

    Args:
        batch: one dataset example with ``"input_features"`` and ``"labels"``
            (called through ``dataset.map`` without ``batched=True``).

    Returns:
        The same mapping with ``"reference"`` (decoded, normalized labels) and
        ``"prediction"`` (decoded, normalized model output) added.
    """
    # as_tensor reuses an existing tensor instead of copy-constructing it
    # (torch.tensor() on a tensor emits a warning).
    input_features = torch.as_tensor(batch["input_features"])
    # Add the batch dimension only when it is actually missing.
    if input_features.dim() == 2:
        input_features = input_features.unsqueeze(0)
    # Follow the model's device rather than assuming CUDA.
    input_features = input_features.to(model.device)

    # NOTE(review): `_normalize` is a private tokenizer method; a public
    # `normalize` may be available in the installed version -- confirm.
    batch["reference"] = processor.tokenizer._normalize(processor.decode(batch['labels']))

    with torch.no_grad():
        predicted_ids = model.generate(input_features)[0]
    transcription = processor.decode(predicted_ids)
    batch["prediction"] = processor.tokenizer._normalize(transcription)
    return batch

# Transcribe every example of the clean validation split, one example per call.
result = dataset["validation.clean"].map(map_to_pred)

# Word error rate, in percent, over the normalized reference/prediction strings.
wer = load("wer")
wer_percent = 100 * wer.compute(predictions=result["prediction"], references=result["reference"])
print(wer_percent)



Map:   0%|          | 0/2703 [00:00<?, ? examples/s]

  input_features = torch.tensor(batch["input_features"]).unsqueeze(0).to("cuda")  # Add batch dimension if missing


3.6698419650352205
