In [1]:
import zipfile
import os

tar_file = r'D:\PyCharmMiscProject\slo.zip'

extract_dir = 'working'

os.makedirs(extract_dir, exist_ok=True)

with zipfile.ZipFile(tar_file, 'r') as tar:
    tar.extractall(path=extract_dir)

print("Extraction completed.")

Extraction completed.


In [2]:
!pip install transformers[torch]
!pip install datasets
!pip install soundfile
!pip install hf_xet

Collecting transformers[torch]
  Downloading transformers-4.53.3-py3-none-any.whl.metadata (40 kB)
Collecting filelock (from transformers[torch])
  Using cached filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers[torch])
  Using cached huggingface_hub-0.33.4-py3-none-any.whl.metadata (14 kB)
Collecting numpy>=1.17 (from transformers[torch])
  Using cached numpy-2.3.1-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting regex!=2019.12.17 (from transformers[torch])
  Using cached regex-2024.11.6-cp311-cp311-win_amd64.whl.metadata (41 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers[torch])
  Using cached tokenizers-0.21.2-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers[torch])
  Using cached safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Collecting tqdm>=4.27 (from transformers[torch])
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting torch>=2.1

In [1]:
from pathlib import Path
import pandas as pd

base_path = Path("working")
metadata_files = list(base_path.rglob("*.tsv"))

train_path = next(p for p in metadata_files if "train" in p.name)
validated_path = next(p for p in metadata_files if "validated" in p.name)

train_df = pd.read_csv(train_path, sep="\t")
validated_df = pd.read_csv(validated_path, sep="\t")

main_df = pd.concat([train_df, validated_df], ignore_index=True)

def ensure_mp3_extension(p):
    return p if p.endswith(".mp3") else p + ".mp3"

main_df["path"] = main_df["path"].apply(ensure_mp3_extension)

clips_dir = base_path / "clips"
main_df["audio_path"] = main_df["path"].apply(lambda p: (clips_dir / p).as_posix())

main_df["exists"] = main_df["audio_path"].apply(lambda p: Path(p).exists())

valid_df = main_df[main_df["exists"]].reset_index(drop=True)

print(f"Valid audio samples: {len(valid_df)}")

Valid audio samples: 1307


In [43]:
from transformers import (
 Wav2Vec2ForCTC,TrainingArguments, Trainer
)
from datasets import Dataset
from jiwer import wer, cer


def prepare_dataset(df, sampling_rate=16000):
    if df.empty:
        raise ValueError("Input DataFrame is empty.")
    df = df.rename(columns={'audio_path': 'audio', 'sentence': 'text'})
    ds = Dataset.from_pandas(df[["audio", "text"]])
    return ds

In [35]:
from pydub import AudioSegment
import numpy as np

def load_audio_pydub(path, target_sampling_rate=16000):
    audio = AudioSegment.from_file(path)
    if audio.frame_rate != target_sampling_rate:
        audio = audio.set_frame_rate(target_sampling_rate)
    samples = np.array(audio.get_array_of_samples()).astype(np.float32) / (2**15)
    if audio.channels > 1:
        samples = samples.reshape((-1, audio.channels)).mean(axis=1)
    return samples, target_sampling_rate

In [36]:
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("mrshu/wav2vec2-large-xlsr-slovene")

def preprocess_single_example(example, processor, sampling_rate=16000):
        from pydub import AudioSegment
        import numpy as np

        path = example["audio"]
        text = example["text"]

        audio = AudioSegment.from_file(path)
        if audio.frame_rate != sampling_rate:
            audio = audio.set_frame_rate(sampling_rate)
        samples = np.array(audio.get_array_of_samples()).astype(np.float32) / (2**15)
        if audio.channels > 1:
            samples = samples.reshape((-1, audio.channels)).mean(axis=1)

        inputs = processor(samples, sampling_rate=sampling_rate, return_attention_mask=True, padding=True)

        with processor.as_target_processor():
            labels = processor(text).input_ids

        return {
            "input_values": inputs.input_values[0],
            "attention_mask": inputs.attention_mask[0],
            "labels": labels
        }



In [37]:
from dataclasses import dataclass
from typing import Dict, List, Union
import torch

@dataclass
class DataCollatorCTCWithPadding:
        processor: Wav2Vec2Processor
        padding: Union[bool, str] = True

        def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
            input_features = [{"input_values":f["input_values"]} for f in features]
            label_features = [{"input_ids":f["labels"]} for f in features]

            batch = self.processor.pad(
                input_features,
                padding=self.padding,
                return_tensors="pt",
            )

            with self.processor.as_target_processor():
                labels_batch = self.processor.pad(
                    label_features,
                    padding=self.padding,
                    return_tensors="pt"
                )

            labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
            batch["labels"] = labels

            return batch

In [5]:
def quick_test_training(dataset, max_samples=100):
    print(f"Running quick test with {max_samples} samples...")

    from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Trainer, TrainingArguments


    class Config:
        MODEL_NAME = "mrshu/wav2vec2-large-xlsr-slovene"
        SAMPLING_RATE = 16000
        NUM_EPOCHS = 2
        BATCH_SIZE = 4
        OUTPUT_DIR = "./wav2vec2-test"

    small_dataset = dataset.select(range(min(max_samples, len(dataset))))

    processor = Wav2Vec2Processor.from_pretrained(Config.MODEL_NAME)
    model = Wav2Vec2ForCTC.from_pretrained(
        Config.MODEL_NAME,
        ignore_mismatched_sizes=True,
        vocab_size=len(processor.tokenizer)
    )

    model.freeze_feature_encoder()

    print("Processing dataset...")
    processed_dataset = small_dataset.map(
        lambda x: preprocess_single_example(x, processor),
        remove_columns=small_dataset.column_names,
        desc="Processing audio files"
    ).filter(lambda x: x is not None)

    print(f"Processed {len(processed_dataset)} examples")

    split = processed_dataset.train_test_split(test_size=0.1)
    train_dataset = split["train"]
    eval_dataset = split["test"]

    data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

    training_args = TrainingArguments(
        output_dir=Config.OUTPUT_DIR,
        per_device_train_batch_size=Config.BATCH_SIZE,
        eval_strategy="steps",
        logging_dir=f"{Config.OUTPUT_DIR}/logs",
        logging_strategy="steps",
        num_train_epochs=Config.NUM_EPOCHS,
        save_steps=100,
        eval_steps=100,
        logging_steps=5,
        save_total_limit=2,
        fp16=True,
        report_to=[],
        remove_unused_columns=False,
        dataloader_drop_last=False,
        group_by_length=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=processor.feature_extractor,
        data_collator=data_collator,
    )

    print("Starting training...")
    trainer.train()
    return trainer, processor

In [38]:
MODEL_NAME = "mrshu/wav2vec2-large-xlsr-slovene"
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME, vocab_size=len(processor.tokenizer), ignore_mismatched_sizes=True)
model.freeze_feature_encoder()

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at mrshu/wav2vec2-large-xlsr-slovene and are newly initialized because the shapes did not match:
- lm_head.weight: found shape torch.Size([31, 1024]) in the checkpoint and torch.Size([33, 1024]) in the model instantiated
- lm_head.bias: found shape torch.Size([31]) in the checkpoint and torch.Size([33]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
dataset = prepare_dataset(valid_df)
ds = dataset.map(lambda x: preprocess_single_example(x, processor), remove_columns=dataset.column_names)

Map: 100%|██████████| 1307/1307 [02:53<00:00,  7.52 examples/s]


In [40]:
split_ds = ds.train_test_split(test_size=0.1)
train_ds = split_ds["train"]
eval_ds = split_ds["test"]

In [42]:
training_args = TrainingArguments(
    output_dir="./wav2vec2-slovene",
    eval_strategy="steps",
    eval_steps=100,
    logging_steps=50,
    save_steps=200,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    fp16=True,
    learning_rate=3e-4,
    save_total_limit=2,
    group_by_length=True,
    remove_unused_columns=False,
    report_to=[],
    logging_dir="./logs"
)

In [44]:
def compute_metrics(pred):
    pred_ids = np.argmax(pred.predictions, axis=-1)
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)

    label_ids = pred.label_ids
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    return {
        "wer": wer(label_str, pred_str),
        "cer": cer(label_str, pred_str)
    }

data_collator = DataCollatorCTCWithPadding(processor=processor)


In [45]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=processor.feature_extractor,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [None]:
trainer.train()
trainer.save_model("./wav2vec2-slovene")
processor.save_pretrained("./wav2vec2-slovene")



Step,Training Loss,Validation Loss


In [7]:
metrics = trainer.evaluate()
print(metrics)



{'eval_loss': 10.612970352172852, 'eval_runtime': 8.2906, 'eval_samples_per_second': 0.603, 'eval_steps_per_second': 0.121, 'epoch': 2.0}


In [18]:
!pip install jiwer
from jiwer import wer

pred_output = trainer.predict(trainer.eval_dataset)





In [19]:
import numpy as np

pred_ids = np.argmax(pred_output.predictions, axis=-1)

pred_ids = pred_ids.tolist()

pred_str=processor.batch_decode(pred_ids, skip_special_tokens=True)

label_ids = pred_output.label_ids

label_ids = label_ids.tolist()

cleaned_labels = [[token_id for token_id in seq if token_id != -100] for seq in label_ids]

label_str = processor.batch_decode(cleaned_labels, skip_special_tokens=True)

In [20]:
wer_scrore = wer(label_str, pred_str)
print(f"WER: {wer_scrore:.2%}")

WER: 100.00%


In [15]:
print(type(predictions.label_ids))
print(type(predictions.label_ids[0]))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [None]:
quick_test_training(dataset, max_samples=1000)