## Login to Hugging Face

In [1]:
from dotenv import load_dotenv
import os
from huggingface_hub import login

load_dotenv()
token = os.getenv("HUGGINGFACE_TOKEN")
login(
    token=token,  # ADD YOUR TOKEN HERE
    add_to_git_credential=True
)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
model_name = "Waktaverse-whisper-KO-large-v3"  # ADD YOUR MODEL NAME HERE
username = "PathFinderKR"  # ADD YOUR USERNAME HERE
repo_id = f"{username}/{model_name}"  # repository id

## Login to Weights & Biases

In [3]:
import wandb

api_key = os.getenv("WANDB_API_KEY")
wandb.login(
    key=api_key  # ADD YOUR API KEY HERE
)
wandb.init(project=model_name)

[34m[1mwandb[0m: Currently logged in as: [33mpathfinderkr[0m ([33mwaktaverse[0m). Use [1m`wandb login --relogin`[0m to force relogin


## Imports

In [4]:
from IPython.display import display, Markdown

# pytorch
import torch

# huggingface
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    pipeline,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
import evaluate

# datasets
from datasets import Audio, load_dataset, DatasetDict

## Device

In [5]:
# Device setup
device = (
    "cuda:0" if torch.cuda.is_available() else # Nvidia GPU
    #"mps" if torch.backends.mps.is_available() else # Apple Silicon GPU
    "cpu"
)
print(f"Device = {device}")

Device = cpu


In [6]:
# Flash Attention Implementation
if device == "cuda:0":
    if torch.cuda.get_device_capability()[0] >= 8: # Ampere, Ada, or Hopper GPUs
        attn_implementation = "flash_attention_2"
        torch_dtype = torch.float16
    else:
        attn_implementation = "eager"
        torch_dtype = torch.float16
else:
    attn_implementation = "eager"
    torch_dtype = torch.float32
print(f"Attention Implementation = {attn_implementation}")

Attention Implementation = eager


## Hyperparameters

In [7]:
################################################################################
# Language
################################################################################
language = "Korean"
language_code = "ko"

################################################################################
# seed
################################################################################
seed=42
torch.manual_seed(seed)

################################################################################
# Generation parameters
################################################################################
max_new_tokens=100
chunk_length_s=30
batch_size=4
sampling_rate=16000

################################################################################
# Training parameters
################################################################################
output_dir="./results"
logging_dir="./logs"
save_strategy="epoch"
logging_strategy="steps" # "steps", "epoch"
if logging_strategy == "steps":
    logging_steps=10
else:
    logging_steps=None
evaluation_strategy="steps" # "steps", "epoch"
load_best_model_at_end=True
metric_for_best_model="wer"
greater_is_better=False
save_total_limit=1
report_to="wandb"

num_train_epochs=2
per_device_train_batch_size=4
gradient_accumulation_steps=4
gradient_checkpointing=True
learning_rate=2e-5
lr_scheduler_type="cosine" # "constant", "linear", "cosine"
warmup_ratio=0.1
optim = "adamw_torch"
weight_decay=0.01

## Model

In [8]:
# Model ID for base model
model_id = "openai/whisper-large-v3"

In [9]:
# load model and processor
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    device_map=device,
    #attn_implementation=attn_implementation,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
# Display the model architecture
display(Markdown(f'```{model}```'))

```WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1280)
      (layers): ModuleList(
        (0-31): 32 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (fc2): Linear(in_features=5120, out_features=1280, bias=True)
          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        )
      )
      (layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): WhisperDecoder(
      (embed_tokens): Embedding(51866, 1280, padding_idx=50256)
      (embed_positions): WhisperPositionalEmbedding(448, 1280)
      (layers): ModuleList(
        (0-31): 32 x WhisperDecoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (activation_fn): GELUActivation()
          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (encoder_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (encoder_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (fc2): Linear(in_features=5120, out_features=1280, bias=True)
          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        )
      )
      (layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    )
  )
  (proj_out): Linear(in_features=1280, out_features=51866, bias=False)
)```

## Dataset

In [11]:
# Dataset ID
dataset_id = "mozilla-foundation/common_voice_17_0"

In [12]:
# Load dataset
dataset = DatasetDict()
dataset["train"] = load_dataset(dataset_id, language_code, split="train", trust_remote_code=True)
dataset["validation"] = load_dataset(dataset_id, language_code, split="validation", trust_remote_code=True)
dataset["test"] = load_dataset(dataset_id, language_code, split="test", trust_remote_code=True)

In [13]:
# Dataset information
dataset

DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 376
    })
    validation: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 330
    })
    test: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 339
    })
})

In [14]:
# Dataset sample
dataset["train"][0]

{'client_id': 'c6ea915bc1573d5253873335a07670c60dfe75546918f591b700e4a992ab7e44ab084bd752b3e69e197900cfa2e45e1f094af17209d83b5065456069e71aa84e',
 'path': '/Users/pathfinder/.cache/huggingface/datasets/downloads/extracted/f16a9158fde2a26b92da08c939e4ccdbbedbc17acd3d4600d513b12bdad3ac73/ko_train_0/common_voice_ko_36880067.mp3',
 'audio': {'path': '/Users/pathfinder/.cache/huggingface/datasets/downloads/extracted/f16a9158fde2a26b92da08c939e4ccdbbedbc17acd3d4600d513b12bdad3ac73/ko_train_0/common_voice_ko_36880067.mp3',
  'array': array([ 1.13686838e-13,  6.53699317e-13,  8.81072992e-13, ...,
         -2.78213702e-06,  1.59776391e-06,  3.08520976e-06]),
  'sampling_rate': 48000},
 'sentence': '그 이웃을 쳐서 거짓 증거하는 사람은 방망이요 칼이요 뾰족한 살이니라',
 'up_votes': 4,
 'down_votes': 0,
 'age': 'twenties',
 'gender': 'female_feminine',
 'accent': '서울',
 'locale': 'ko',
 'segment': '',
 'variant': ''}

In [15]:
# Tokenization sample
tokenized_sentence = processor.tokenizer(dataset["train"][0]["sentence"])
decoded_sentence = processor.tokenizer.decode(tokenized_sentence["input_ids"])

print(f"Tokenized Sentence: {tokenized_sentence}")
print(f"Decoded Sentence: {decoded_sentence}")

Tokenized Sentence: {'input_ids': [50258, 50364, 22069, 4329, 249, 225, 1638, 43517, 238, 2393, 3675, 1372, 241, 33830, 4285, 7116, 12211, 2124, 10006, 46407, 3946, 1206, 6639, 120, 3946, 1206, 531, 122, 108, 21799, 3049, 21155, 1129, 1425, 167, 2742, 50257], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
Decoded Sentence: <|startoftranscript|><|notimestamps|>그 이웃을 쳐서 거짓 증거하는 사람은 방망이요 칼이요 뾰족한 살이니라<|endoftext|>


## Preprocessing

In [16]:
# Down sample audio
def preprocess_function(examples):
    audio = examples["audio"]
    
    examples["input_features"] = processor.feature_extractor(audio["array"], sampling_rate=sampling_rate).input_features[0]
    examples["labels"] = processor.tokenizer(examples["sentence"], return_tensors="pt").input_ids
    
    return examples

# Preprocess dataset
dataset = dataset.map(preprocess_function, remove_columns=["audio", "sentence"], num_proc=4)

## Inference

In [17]:
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    return_timestamps=True,

    max_new_tokens=max_new_tokens,
    chunk_length_s=chunk_length_s,
    batch_size=batch_size
)

In [None]:
def speech_recognition(audio):
    text = pipe(audio)["text"]
    return result

In [18]:
sample = dataset["train"][0]["audio"]

In [19]:
result = speech_recognition(sample)

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


In [20]:
result

{'text': ' 그 이웃을 쳐서 거짓 증거하는 사람은 방망이요 칼이요 뾰족한 살인이라',
 'chunks': [{'timestamp': (0.0, 7.78),
   'text': ' 그 이웃을 쳐서 거짓 증거하는 사람은 방망이요 칼이요 뾰족한 살인이라'}]}

## Training

In [None]:
def compute_metrics(pred):
    metric = evaluate.load("wer")
    
    pred_ids = pred.predictions
    label_ids = pred.label_ids
    
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    
    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    
    wer = 100 * metric.compute(predictions=pred_str, references=label_str)
    
    return {"wer": wer}

In [21]:
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    logging_dir=logging_dir,
    save_strategy=save_strategy,
    logging_strategy=logging_strategy,
    logging_steps=logging_steps,
    evaluation_strategy=evaluation_strategy,
    load_best_model_at_end=load_best_model_at_end,
    metric_for_best_model=metric_for_best_model,
    greater_is_better=greater_is_better,
    save_total_limit=save_total_limit,
    report_to=report_to,
    
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    optim=optim,
    weight_decay=weight_decay,
    seed=seed
)

In [22]:
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=processor.feature_extractor,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

NameError: name 'ds' is not defined

In [None]:
trainer.train()