# This is a sample Jupyter Notebook

Below is an example of a code cell. 
Put your cursor into the cell and press Shift+Enter to execute it and select the next one, or click 'Run Cell' button.

Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.

To learn more about Jupyter Notebooks in PyCharm, see [help](https://www.jetbrains.com/help/pycharm/ipython-notebook-support.html).
For an overview of PyCharm, go to Help -> Learn IDE features or refer to [our documentation](https://www.jetbrains.com/help/pycharm/getting-started.html).

In [1]:
import tarfile
import os

tar_file = 'sl.tar'


extract_dir = 'working'

os.makedirs(extract_dir, exist_ok=True)

with tarfile.open(tar_file, 'r') as tar:
    tar.extractall(path=extract_dir)

print("Extraction completed.")

Extraction completed.


In [2]:
!pip install transformers[torch]
!pip install datasets
!pip install soundfile
!pip install hf_xet




In [1]:
from pathlib import Path
import pandas as pd

base_path = Path("working")
metadata_files = list(base_path.rglob("*.tsv"))

train_path = next(p for p in metadata_files if "train" in p.name)
validated_path = next(p for p in metadata_files if "validated" in p.name)

train_df = pd.read_csv(train_path, sep="\t")
validated_df = pd.read_csv(validated_path, sep="\t")

main_df = pd.concat([train_df, validated_df], ignore_index=True)

def ensure_mp3_extension(p):
    return p if p.endswith(".mp3") else p + ".mp3"

main_df["path"] = main_df["path"].apply(ensure_mp3_extension)

clips_dir = base_path / "clips"
main_df["audio_path"] = main_df["path"].apply(lambda p: (clips_dir / p).as_posix())

main_df["exists"] = main_df["audio_path"].apply(lambda p: Path(p).exists())

valid_df = main_df[main_df["exists"]].reset_index(drop=True)

print(f"Valid audio samples: {len(valid_df)}")


Valid audio samples: 1307


In [2]:
from datasets import Dataset

def prepare_dataset(df, sampling_rate=16000):
    if df.empty:
        raise ValueError("Input DataFrame is empty.")
    df = df.rename(columns={'audio_path': 'audio', 'sentence': 'text'})
    ds = Dataset.from_pandas(df[["audio", "text"]])
    return ds


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from pydub import AudioSegment
import numpy as np

def load_audio_pydub(path, target_sampling_rate=16000):
    audio = AudioSegment.from_file(path)
    if audio.frame_rate != target_sampling_rate:
        audio = audio.set_frame_rate(target_sampling_rate)
    samples = np.array(audio.get_array_of_samples()).astype(np.float32) / (2**15)
    if audio.channels > 1:
        samples = samples.reshape((-1, audio.channels)).mean(axis=1)
    return samples, target_sampling_rate

In [4]:
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("mrshu/wav2vec2-large-xlsr-slovene")

def preprocess_dataset(batch, processor):
    input_values = []
    attention_mask = []
    labels = []


    for audio_path, text in zip(batch["audio"], batch["text"]):
        speech_array, sampling_rate = load_audio_pydub(audio_path)

        inputs = processor(speech_array, sampling_rate=sampling_rate, return_attention_mask=True, padding=True)
        input_values.append(inputs["input_values"][0])
        attention_mask.append(inputs["attention_mask"][0])

        with processor.as_target_processor():
            label_ids = processor(text).input_ids
        labels.append(label_ids)

    return {
        "input_values": input_values,
        "attention_mask": attention_mask,
        "labels": labels
    }

dataset = prepare_dataset(valid_df, sampling_rate=16000)

processed_dataset = dataset.map(
    lambda batch: preprocess_dataset(batch, processor),
    batched=True,
    remove_columns=dataset.column_names,
    batch_size=8,
)

Map: 100%|██████████| 1307/1307 [02:35<00:00,  8.42 examples/s]


In [5]:
!pip install protobuf
!pip install soundfile datasets transformers[sentencepiece]
!pip install --upgrade transformers
def quick_test_training(dataset, max_samples=100):
    print(f"Running quick test with {max_samples} samples...")

    from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Trainer, TrainingArguments

    class Config:
        MODEL_NAME = "mrshu/wav2vec2-large-xlsr-slovene"
        SAMPLING_RATE = 16000
        NUM_EPOCHS = 2
        BATCH_SIZE = 8
        OUTPUT_DIR = "./wav2vec2-test"

    small_dataset = dataset.select(range(min(max_samples, len(dataset))))

    processor = Wav2Vec2Processor.from_pretrained(Config.MODEL_NAME)
    model = Wav2Vec2ForCTC.from_pretrained(Config.MODEL_NAME,ignore_mismatched_sizes=True, vocab_size=len(processor.tokenizer))

    model.freeze_feature_encoder()

    processed_dataset = dataset.map(
    lambda x: preprocess_dataset(x, processor),
    remove_columns=dataset.column_names
).filter(lambda x: x is not None)

    split = processed_dataset.train_test_split(test_size=0.1)
    train_dataset = split["train"]
    eval_dataset = split["test"]

    training_args = TrainingArguments(
        output_dir=Config.OUTPUT_DIR,
        per_device_train_batch_size=Config.BATCH_SIZE,
        eval_strategy="steps",
        num_train_epochs=Config.NUM_EPOCHS,
        save_steps=50,
        eval_steps=50,
        logging_steps=10,
        save_total_limit=2,
        fp16=True,
        report_to=[],
        remove_unused_columns=False
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=processor
    )

    print("Columns after preprocessing:", processed_dataset.column_names)
    print("First item:", processed_dataset[0])

    print("Starting training...")
    trainer.train()
    return trainer, processor




In [6]:
import transformers
print("Transformers version:", transformers.__version__)
print("TrainingArguments location:", transformers.TrainingArguments.__module__)
dataset = prepare_dataset(valid_df, sampling_rate=16000)
trainer, processor = quick_test_training(dataset)

Transformers version: 4.53.2
TrainingArguments location: transformers.training_args
Running quick test with 100 samples...


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at mrshu/wav2vec2-large-xlsr-slovene and are newly initialized because the shapes did not match:
- lm_head.weight: found shape torch.Size([31, 1024]) in the checkpoint and torch.Size([33, 1024]) in the model instantiated
- lm_head.bias: found shape torch.Size([31]) in the checkpoint and torch.Size([33]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map:   0%|          | 0/1307 [00:00<?, ? examples/s]


FileNotFoundError: [Errno 2] No such file or directory: 'w'

In [None]:
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("mrshu/wav2vec2-large-xlsr-slovene")


In [None]:
from transformers import TrainingArguments
args = TrainingArguments(output_dir="./test")
print(args)

In [None]:
from transformers import (
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2Processor,
    Wav2Vec2ForCTC,
    TrainingArguments,
    Trainer
)

In [None]:
@dataclass
class DataCollator:
    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Union[int, None] = None
    max_length_labels: Union[int, None] = None
    pad_to_multiple_of: Union[int, None] = None
    pad_to_multiple_of_labels: Union[int, None] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels

        return batch

In [None]:
def add_noise(audio, noise_factor=0.005):
    """Add noise to audio for data augmentation"""
    noise = np.random.randn(len(audio))
    return audio + noise_factor * noise

In [None]:
def speed_change(audio, factor=None):
    """Change speed of audio"""
    if factor is None:
        factor = np.random.uniform(0.9, 1.1) # change the values to try
    indices = np.round(np.arange(0, len(audio), factor)).astype(int)
    indices = indices[indices < len(audio)]
    return audio[indices]

In [None]:
def preprocess_dataset(examples, processor, augment=False):
    """Preprocess examples"""

    input_values = []
    labels = []

    for audio_data, text in zip(examples["audio"], examples["text"]):
        try:
            audio_array = audio_data["array"]

            if augment and np.random.random() > 0.5:
                aug_type = np.random.choice(['noise', 'speed'])
                if aug_type == 'noise':
                    audio_array = add_noise(audio_array, noise_factor=0.005)
                elif aug_type == 'speed':
                    audio_array = speed_change(audio_array)

            if len(audio_array) > Config.MAX_INPUT_LENGTH:
                audio_array = audio_array[:Config.MAX_INPUT_LENGTH]

            input_values.append(audio_array)

            normalized_text = normalize_slovenian_text(text)
            labels.append(normalized_text)

        except Exception as e:
            print(f"Error processing example: {e}")
            continue

    if len(input_values) == 0:
        return {"input_values": [], "labels": []}

    inputs = processor(
        input_values,
        sampling_rate=Config.SAMPLING_RATE,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=Config.MAX_INPUT_LENGTH
    )

    with processor.as_target_processor():
        label_features = processor(labels).input_ids

    return {
        "input_values": inputs.input_values,
        "labels": label_features
    }


In [None]:
def save_results_to_notebook():

    import pandas as pd
    import matplotlib.pyplot as plt

    if os.path.exists("/working/wav2vec2-results/training_history.csv"):
        df = pd.read_csv("/working/wav2vec2-results/training_history.csv")

        plt.figure(figsize=(12, 4))

        plt.subplot(1, 2, 1)
        plt.plot(df['train_loss'], label='Train Loss')
        plt.plot(df['eval_loss'], label='Eval Loss')
        plt.title('Training Progress')
        plt.legend()

        plt.subplot(1, 2, 2)
        plt.plot(df['learning_rate'])
        plt.title('Learning Rate')

        plt.tight_layout()
        plt.show()

        print(f"Final Train Loss: {df['train_loss'].iloc[-1]:.4f}")
        print(f"Final Eval Loss: {df['eval_loss'].iloc[-1]:.4f}")
        print(f"Total Steps: {len(df)}")

In [None]:
def train_the_dataset(dataset):

    print(f"Starting training with {len(dataset)} samples")

    train_test = dataset.train_test_split(test_size=0.2, seed=42)
    train_dataset = train_test["train"]
    eval_dataset = train_test["test"]

    print(f"Train samples: {len(train_dataset)}")
    print(f"Eval samples: {len(eval_dataset)}")

    vocab_dict = create_vocabulary_from_dataset(train_dataset)

    import os
    os.makedirs(Config.OUTPUT_DIR, exist_ok=True)

    with open(f"{Config.OUTPUT_DIR}/vocab.json", "w", encoding="utf-8") as f:
        json.dump(vocab_dict, f, ensure_ascii=False, indent=2)

    tokenizer = Wav2Vec2CTCTokenizer(
        vocab_file=f"{Config.OUTPUT_DIR}/vocab.json",
        unk_token="[UNK]",
        pad_token="[PAD]",
        word_delimiter_token=" "
    )

    feature_extractor = Wav2Vec2FeatureExtractor(
        feature_size=1,
        sampling_rate=Config.SAMPLING_RATE,
        padding_value=0.0,
        do_normalize=True,
        return_attention_mask=True
    )

    processor = Wav2Vec2Processor(
        feature_extractor=feature_extractor,
        tokenizer=tokenizer
    )

    print("Preprocessing training data...")

    def preprocess_train(examples):
        return preprocess_dataset(examples, processor, augment=True)

    def preprocess_eval(examples):
        return preprocess_dataset(examples, processor, augment=False)

    train_dataset = train_dataset.map(
        preprocess_train,
        batched=True,
        batch_size=8,
        remove_columns=train_dataset.column_names,
        num_proc=1
    )

    eval_dataset = eval_dataset.map(
        preprocess_eval,
        batched=True,
        batch_size=8,
        remove_columns=eval_dataset.column_names,
        num_proc=1
    )

    from transformers import Wav2Vec2ForCTC

    model = Wav2Vec2ForCTC.from_pretrained(
            Config.MODEL_NAME,
            attention_dropout=0.1,
            hidden_dropout=0.1,
            feat_proj_dropout=0.0,
            mask_time_prob=0.05,
            layerdrop=0.1,
            ctc_loss_reduction="mean",
            pad_token_id=processor.tokenizer.pad_token_id,
            vocab_size=len(processor.tokenizer),
            ctc_zero_infinity=True
        )

    model.freeze_feature_extractor()

    training_args = TrainingArguments(
            output_dir=Config.OUTPUT_DIR,
            group_by_length=True,
            per_device_train_batch_size=Config.BATCH_SIZE,
            per_device_eval_batch_size=Config.BATCH_SIZE,
            gradient_accumulation_steps=Config.GRADIENT_ACCUMULATION_STEPS,
            eval_strategy="steps",
            num_train_epochs=Config.NUM_EPOCHS,
            fp16=True,
            save_steps=Config.SAVE_STEPS,
            eval_steps=Config.EVAL_STEPS,
            logging_steps=100,
            learning_rate=Config.LEARNING_RATE,
            weight_decay=0.005,
            warmup_steps=Config.WARMUP_STEPS,
            save_total_limit=3,
            push_to_hub=False,
            dataloader_num_workers=0,
            remove_unused_columns=False,
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
        )

    data_collator = DataCollator(processor=processor, padding=True)

    trainer = Trainer(
        model=model,
        data_collator=data_collator,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=processor.feature_extractor,
    )


    print("Starting training...")
    train_result = trainer.train()

    print("Saving model...")
    trainer.save_model()
    processor.save_pretrained(Config.OUTPUT_DIR)

    save_results_to_notebook()

    print(f"Training completed!")
    print(f"Final training loss: {train_result.training_loss:.4f}")
    print(f"Model saved to: {Config.OUTPUT_DIR}")

    return trainer, processor

In [None]:
def test_model(processor_path, model_path, test_audio_path):

    processor = Wav2Vec2Processor.from_pretrained(processor_path)
    model = Wav2Vec2ForCTC.from_pretrained(model_path)

    audio, sr = librosa.load(test_audio_path, sr=16000)

    inputs = processor(audio, sampling_rate=16000, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(inputs.input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]

    return transcription

In [None]:
def create_visual_results_dashboard(trainer, processor, vocab_dict, config):

    plt.style.use('default')
    sns.set_palette("husl")

    fig = plt.figure(figsize=(20, 12))

    if hasattr(trainer.state, 'log_history') and trainer.state.log_history:
        logs = trainer.state.log_history

        train_losses = []
        eval_losses = []
        learning_rates = []
        steps = []

        for log in logs:
            if 'train_loss' in log:
                train_losses.append(log['train_loss'])
                steps.append(log.get('step', len(train_losses)))
            if 'eval_loss' in log:
                eval_losses.append(log['eval_loss'])
            if 'learning_rate' in log:
                learning_rates.append(log['learning_rate'])

        plt.subplot(2, 4, 1)
        if train_losses:
            plt.plot(steps[:len(train_losses)], train_losses, 'b-', linewidth=2, label='Training Loss')
            plt.title('Training Loss Over Time', fontsize=14, fontweight='bold')
            plt.xlabel('Steps')
            plt.ylabel('Loss')
            plt.grid(True, alpha=0.3)
            plt.legend()

        plt.subplot(2, 4, 2)
        if eval_losses:
            eval_steps = np.linspace(0, len(train_losses), len(eval_losses))
            plt.plot(eval_steps, eval_losses, 'r-', linewidth=2, label='Validation Loss')
            plt.title('Validation Loss', fontsize=14, fontweight='bold')
            plt.xlabel('Steps')
            plt.ylabel('Loss')
            plt.grid(True, alpha=0.3)
            plt.legend()

        plt.subplot(2, 4, 3)
        if learning_rates:
            plt.plot(learning_rates, 'g-', linewidth=2)
            plt.title('Learning Rate Schedule', fontsize=14, fontweight='bold')
            plt.xlabel('Steps')
            plt.ylabel('Learning Rate')
            plt.grid(True, alpha=0.3)
            plt.ticklabel_format(style='scientific', axis='y', scilimits=(0,0))

    plt.subplot(2, 4, 4)

    char_types = {
        'Letters': 0,
        'Punctuation': 0,
        'Special': 0,
        'Space': 0
    }

    for char in vocab_dict.keys():
        if char == ' ':
            char_types['Space'] += 1
        elif char in '[PAD][UNK]':
            char_types['Special'] += 1
        elif char.isalpha():
            char_types['Letters'] += 1
        else:
            char_types['Punctuation'] += 1

    colors = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99']
    plt.pie(char_types.values(), labels=char_types.keys(), autopct='%1.1f%%',
            colors=colors, startangle=90)
    plt.title('Vocabulary Composition', fontsize=14, fontweight='bold')

    plt.subplot(2, 4, 5)

    train_size = len(trainer.train_dataset) if hasattr(trainer, 'train_dataset') else 0
    eval_size = len(trainer.eval_dataset) if hasattr(trainer, 'eval_dataset') else 0

    dataset_info = ['Train Samples', 'Eval Samples', 'Vocab Size']
    dataset_values = [train_size, eval_size, len(vocab_dict)]

    bars = plt.bar(dataset_info, dataset_values, color=['skyblue', 'lightcoral', 'lightgreen'])
    plt.title('Dataset Statistics', fontsize=14, fontweight='bold')
    plt.ylabel('Count')

    for bar, value in zip(bars, dataset_values):
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + max(dataset_values)*0.01,
                str(value), ha='center', va='bottom', fontweight='bold')

    plt.subplot(2, 4, 6)
    plt.axis('off')

    config_text = f"""
    MODEL CONFIGURATION

    Model: {config.MODEL_NAME.split('/')[-1]}
    Learning Rate: {config.LEARNING_RATE}
    Batch Size: {config.BATCH_SIZE}
    Epochs: {config.NUM_EPOCHS}
    Max Audio Length: {config.MAX_INPUT_LENGTH // 16000}s

    VOCABULARY SAMPLE
    {list(vocab_dict.keys())[:15]}...

    TRAINING STATUS
    Training Completed
    Model Saved
    Processor Saved
    """

    plt.text(0.1, 0.9, config_text, transform=plt.gca().transAxes,
             fontsize=11, verticalalignment='top', fontfamily='monospace',
             bbox=dict(boxstyle="round,pad=0.5", facecolor="lightblue", alpha=0.8))

    plt.subplot(2, 4, 7)

    final_train_loss = train_losses[-1] if train_losses else 0
    final_eval_loss = eval_losses[-1] if eval_losses else 0
    total_steps = len(train_losses)

    performance_data = {
        'Metric': ['Final Train Loss', 'Final Eval Loss', 'Total Steps', 'Vocab Coverage'],
        'Value': [f'{final_train_loss:.4f}', f'{final_eval_loss:.4f}',
                 total_steps, f'{len(vocab_dict)} chars']
    }

    plt.axis('off')
    table_data = []
    for metric, value in zip(performance_data['Metric'], performance_data['Value']):
        table_data.append([metric, value])

    table = plt.table(cellText=table_data,
                     colLabels=['Metric', 'Value'],
                     cellLoc='left',
                     loc='center',
                     colWidths=[0.6, 0.4])
    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.scale(1.2, 2)

    for i in range(len(table_data) + 1):
        for j in range(2):
            cell = table[(i, j)]
            if i == 0:  # Header
                cell.set_facecolor('#4CAF50')
                cell.set_text_props(weight='bold', color='white')
            else:  # Data rows
                cell.set_facecolor('#E8F5E8' if i % 2 == 0 else 'white')

    plt.title('📋 Performance Summary', fontsize=14, fontweight='bold', pad=20)

    plt.subplot(2, 4, 8)

    slovenian_chars = []
    regular_chars = []

    for char in vocab_dict.keys():
        if char in 'čšžČŠŽ':
            slovenian_chars.append(char)
        elif char.isalpha() and char not in ['[PAD]', '[UNK]']:
            regular_chars.append(char)

    char_analysis = ['Slovenian Chars', 'Regular Chars', 'Punctuation', 'Special']
    char_counts = [len(slovenian_chars), len(regular_chars),
                  len([c for c in vocab_dict.keys() if not c.isalnum() and c not in ' [PAD][UNK]']),
                  2]

    plt.bar(char_analysis, char_counts, color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4'])
    plt.title('Slovenian Language Analysis', fontsize=14, fontweight='bold')
    plt.ylabel('Character Count')
    plt.xticks(rotation=45)

    for i, count in enumerate(char_counts):
        plt.text(i, count + 0.1, str(count), ha='center', va='bottom', fontweight='bold')

    plt.tight_layout(pad=3.0)
    plt.suptitle('Wav2Vec2 Slovenian Training Results Dashboard',
                fontsize=20, fontweight='bold', y=0.98)

    output_path = "/kaggle/working/training_results_dashboard.png"
    plt.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.show()

    print(f"Dashboard saved as: {output_path}")

    return output_path

def create_simple_summary_table(trainer, vocab_dict, config):

    print("\n" + "="*60)
    print(" SLOVENIAN WAV2VEC2 TRAINING SUMMARY")
    print("="*60)

    final_train_loss = "N/A"
    final_eval_loss = "N/A"
    total_steps = 0

    if hasattr(trainer.state, 'log_history') and trainer.state.log_history:
        logs = trainer.state.log_history
        train_losses = [log.get('train_loss') for log in logs if 'train_loss' in log]
        eval_losses = [log.get('eval_loss') for log in logs if 'eval_loss' in log]

        if train_losses:
            final_train_loss = f"{train_losses[-1]:.4f}"
            total_steps = len(train_losses)
        if eval_losses:
            final_eval_loss = f"{eval_losses[-1]:.4f}"

    summary_data = [
        ("Training Date", datetime.now().strftime("%Y-%m-%d %H:%M")),
        ("Model", config.MODEL_NAME.split('/')[-1]),
        ("Training Samples", len(trainer.train_dataset) if hasattr(trainer, 'train_dataset') else 0),
        ("Validation Samples", len(trainer.eval_dataset) if hasattr(trainer, 'eval_dataset') else 0),
        ("Vocabulary Size", len(vocab_dict)),
        ("Final Train Loss", final_train_loss),
        ("Final Eval Loss", final_eval_loss),
        ("Learning Rate", config.LEARNING_RATE),
        ("Epochs Completed", config.NUM_EPOCHS),
        ("Batch Size", config.BATCH_SIZE),
        ("Total Training Steps", total_steps),
    ]

    for label, value in summary_data:
        print(f"{label:<25} : {value}")

    print("="*60)

    print("VOCABULARY SAMPLE:")
    vocab_sample = list(vocab_dict.keys())[:20]
    print(f"   {vocab_sample}")
    if len(vocab_dict) > 20:
        print(f"   ... and {len(vocab_dict) - 20} more characters")

    print("="*60)
    print("Training completed successfully!")
    print(f"Model saved to: {config.OUTPUT_DIR}")
    print("="*60)

def show_training_results(trainer, processor, vocab_dict, config):

    dashboard_path = create_visual_results_dashboard(trainer, processor, vocab_dict, config)

    create_simple_summary_table(trainer, vocab_dict, config)

    print("\n SLOVENIAN CHARACTERS FOUND:")
    slovenian_chars = [char for char in vocab_dict.keys() if char in 'čšžćđČŠŽĆĐ']
    if slovenian_chars:
        print(f"   {slovenian_chars}")
    else:
        print("   No Slovenian-specific characters found in this small sample")

    print(f"\nVisual dashboard saved as PNG: {dashboard_path}")
    print("You can download this image file from Kaggle!")

    return dashboard_path

In [None]:
def quick_test_training(dataset, max_samples=100):

    print(f"Running quick test with {max_samples} samples...")

    small_dataset = dataset.select(range(min(max_samples, len(dataset))))

    Config.NUM_EPOCHS = 2
    Config.SAVE_STEPS = 50
    Config.EVAL_STEPS = 50
    Config.OUTPUT_DIR = "/working/wav2vec2-test"
    print(small_dataset[0])
    return train_the_dataset(small_dataset)

In [None]:
!pip install transformers[torch]
!pip install hf_xet

main_df = pd.concat([dfs['train'], dfs['validated']], ignore_index=True)
print("After concat:", main_df.shape)

main_df["audio_path"] = main_df["path"].apply(lambda p: str(base_clips_dir / "clips" / p))

main_df = main_df[main_df["audio_path"].apply(lambda x: Path(x).exists())]
print("After filtering valid paths:", len(main_df))

dataset = prepare_dataset(main_df, sampling_rate=Config.SAMPLING_RATE)

trainer, processor = quick_test_training(dataset)


In [None]:
!pip install accelerate>=0.26.0
!pip install transformers[torch]

def real_quick_test(dataset, max_samples=20):

    small_dataset = dataset.select(range(max_samples))

    Config.NUM_EPOCHS = 1
    Config.BATCH_SIZE = 2
    Config.SAVE_STEPS = 10
    Config.EVAL_STEPS = 10
    Config.OUTPUT_DIR = "/kaggle/working/wav2vec2-quick"

    trainer, processor = train_the_dataset(small_dataset)
    show_training_results(trainer, processor, vocab_dict, Config)


    return trainer, processor

trainer, processor = real_quick_test(dataset)


In [None]:
print(main_df["path"].head())

In [None]:
print(base_clips_dir)
print(list((base_clips_dir / "clips").glob("*.mp3"))[:3])

In [None]:
main_df["audio_path"] = main_df.apply(get_audio_path, axis=1)

print(main_df["audio_path"].head())
print(main_df["audio_path"].apply(lambda x: Path(x).exists()).value_counts())

In [None]:
print(main_df["audio_path"].head())
print(main_df["audio_path"].apply(lambda x: Path(x).exists()).value_counts())

In [None]:
print("Available metadata files:")
for name, df in dfs.items():
    print(f"{name}: {len(df)} rows, columns: {list(df.columns)}")

print("Main_df shape after concat:", main_df.shape)


In [None]:
print(main_df[["audio_path", "sentence"]].head())

In [None]:
print(main_df["audio_path"].head(10).tolist())

In [None]:
for path in main_df["audio_path"].head(10):
    print(Path(path).exists())

In [None]:
print(base_clips_dir)
print(list(Path(base_clips_dir / "clips").glob("*.mp3"))[:5])

In [None]:
main_df = pd.concat([dfs["train"], dfs["validated"]], ignore_index=True)
print(main_df.columns)
print(main_df[["path", "sentence"]].head())


In [None]:
main_df["audio_path"] = main_df["path"].apply(lambda p: str(base_clips_dir / "clips" / p))
main_df = main_df[main_df["audio_path"].apply(lambda x: Path(x).exists())]
print(f"✅ Valid audio samples: {len(main_df)}")

In [None]:
import random

# Pick 5 random entries from main_df
sample_paths = main_df["path"].sample(5).tolist()
for p in sample_paths:
    full_path = base_clips_dir / "clips" / p
    print(f"{full_path} → Exists? {Path(full_path).exists()}")

In [None]:
main_df = pd.concat([dfs["train"], dfs["validated"]], ignore_index=True)

In [None]:
print(main_df["path"].head())

In [None]:
if not main_df["path"].iloc[0].endswith(".mp3"):
    main_df["path"] = main_df["path"].apply(lambda x: x + ".mp3")

In [None]:
main_df["audio_path"] = main_df["path"].apply(lambda p: str(Path(base_clips_dir) / "clips" / p))

In [None]:
main_df["exists"] = main_df["audio_path"].apply(lambda x: Path(x).exists())
valid_df = main_df[main_df["exists"]]

In [None]:
print(f"✅ Valid audio samples: {len(valid_df)}")
print(valid_df[["audio_path", "sentence"]].head())

In [None]:
dataset = prepare_dataset(valid_df, sampling_rate=Config.SAMPLING_RATE)

In [None]:
import transformers
print(transformers.__version__)

In [None]:
!pip uninstall torchcodec -y
!pip install torch --index-url https://download.pytorch.org/whl/cpu
!pip install git+https://github.com/pytorch/torchcodec.git


In [32]:
import torchaudio.transforms as T

resampler = T.Resample(orig_freq=48000, new_freq=16000)

speech_array_16k = resampler(speech_array)


In [41]:
from pydub import AudioSegment

audio_path = 'working\\clips\\d5effe595a27a3e92d144e1e0a4b9451574082a8dcea3ab59a4270c77b495d0bc87fe388af5e96e709354446aa6d7e7bbbd592a17b042e05ec65a525f031541c.mp3'
audio = AudioSegment.from_file(audio_path)
print(f"Channels: {audio.channels}, Frame rate: {audio.frame_rate}, Duration: {len(audio)}ms")

Channels: 1, Frame rate: 48000, Duration: 3408ms
