<a href="https://colab.research.google.com/github/Stanislav3423/VoiceRecognition/blob/main/VoiceRecognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
import os

# 1. Підключаємо Google Drive
drive.mount('/content/drive')

# 2. Шлях до вашого архіву на Drive (ЗМІНІТЬ, якщо назвали папку інакше)
# Якщо ви кинули просто в корінь диску, то шлях: '/content/drive/MyDrive/dataset.zip'
zip_path = '/content/drive/MyDrive/volyn_dialect_dataset.zip'

# 3. Розпаковка в середовище Colab
if os.path.exists(zip_path):
    print("Архів знайдено! Розпаковую...")
    !unzip -q "$zip_path" -d /content/
    print("Готово! Папка volyn_dialect_dataset створена в /content/")
else:
    print(f"ПОМИЛКА: Файл {zip_path} не знайдено. Перевірте шлях!")

Mounted at /content/drive
Архів знайдено! Розпаковую...
Готово! Папка volyn_dialect_dataset створена в /content/


In [None]:
!pip install torch torchaudio transformers jiwer librosa pandas soundfile accelerate datasets hf_xet sentencepiece protobuf

Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading jiwer-4.0.0-py3-none-any.whl (23 kB)
Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m78.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-4.0.0 rapidfuzz-3.14.3


In [None]:
import pandas as pd
import torch
import librosa
from transformers import pipeline, Wav2Vec2ForCTC, Wav2Vec2Processor
from jiwer import wer, cer
import os
import re

# --- SETTINGS ---
# Root folder of the extracted dataset and the CSV manifest listing the test clips.
DATASET_DIR = "/content/volyn_dialect_dataset"
TEST_MANIFEST = os.path.join(DATASET_DIR, "test_manifest.csv")

# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Використовується пристрій: {device.upper()}")

# Models to benchmark. "type" selects the runner used in main():
# "whisper" -> run_whisper, anything else -> run_wav2vec2.
MODELS_TO_TEST = [
    {"name": "openai/whisper-tiny", "type": "whisper"},
    {"name": "openai/whisper-small", "type": "whisper"},
    {"name": "anton-l/wav2vec2-large-xlsr-53-ukrainian", "type": "wav2vec2"}
]

def normalize_text(text):
    """Normalize a transcript for WER/CER comparison.

    Lower-cases the text, removes every character that is not a lowercase
    Ukrainian Cyrillic letter or whitespace, and collapses each run of
    whitespace into a single space.

    Args:
        text: The raw transcript. Non-string values (e.g. NaN from pandas)
            are treated as missing.

    Returns:
        The normalized string, or "" when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    letters_only = re.sub(r"[^а-яіїєґ\s]", "", text.lower())
    # Deleting punctuation such as " - " leaves doubled spaces behind; those
    # would be counted as character errors, so squeeze whitespace runs.
    return " ".join(letters_only.split())

def load_audio(path):
    """Read an audio file with librosa, resampled to 16 kHz.

    Args:
        path: Location of the audio file.

    Returns:
        The loaded samples, or None when loading raised an exception
        (the error is printed).
    """
    try:
        waveform, _rate = librosa.load(path, sr=16000)
    except Exception as err:
        print(f"Помилка завантаження {path}: {err}")
        return None
    else:
        return waveform

def run_whisper(model_name, valid_audio):
    """Transcribe audio clips with a Whisper checkpoint via the HF ASR pipeline.

    Args:
        model_name: Hub identifier of the Whisper model.
        valid_audio: List of loaded audio arrays to transcribe.

    Returns:
        A list with one transcription string per input clip; an empty list
        when ``valid_audio`` is empty.
    """
    print(f"⏳ Запуск {model_name}...")
    # Check for an empty batch before building the pipeline, so that no model
    # is downloaded or loaded when there is nothing to transcribe.
    if not valid_audio:
        return []

    pipe = pipeline("automatic-speech-recognition", model=model_name, device=device)
    predictions = pipe(valid_audio)
    # The pipeline result is either a list of dicts or a single dict.
    if isinstance(predictions, list):
        return [p['text'] for p in predictions]
    return [predictions['text']]

def run_wav2vec2(model_name, valid_audio):
    """Transcribe audio clips with a Wav2Vec2 CTC checkpoint.

    Each clip is processed on its own and decoded by taking the arg-max
    token at every frame before `batch_decode`.

    Args:
        model_name: Hub identifier (or local path) of the Wav2Vec2 model.
        valid_audio: List of loaded audio arrays; the processor is told
            they are sampled at 16 kHz.

    Returns:
        A list with one transcription string per input clip; an empty list
        when ``valid_audio`` is empty.
    """
    print(f"Запуск {model_name}...")
    # Same contract as run_whisper: nothing to transcribe means no model load.
    if not valid_audio:
        return []

    processor = Wav2Vec2Processor.from_pretrained(model_name)
    model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)

    results = []
    for audio in valid_audio:
        input_values = processor(audio, return_tensors="pt", sampling_rate=16000).input_values
        input_values = input_values.to(device)
        # Inference only, so skip gradient tracking.
        with torch.no_grad():
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        # One clip per forward pass, so the decoded batch has a single entry.
        results.append(processor.batch_decode(predicted_ids)[0])
    return results

def main():
    """Benchmark every model in MODELS_TO_TEST on the test manifest.

    Reads TEST_MANIFEST (columns ``path`` and ``sentence``), resolves each
    path against DATASET_DIR, loads the audio, runs every configured model,
    prints WER/CER per model and writes the summary table to
    /content/benchmark_results_pre_train.csv.

    A manifest row is used only when its file exists, its audio loads and
    its normalized reference is non-empty, so that references and
    predictions always stay aligned one-to-one.
    """
    if not os.path.exists(TEST_MANIFEST):
        print(f"ПОМИЛКА: Не знайдено {TEST_MANIFEST}.")
        return

    df = pd.read_csv(TEST_MANIFEST)
    print(f"Маніфест завантажено: {len(df)} рядків.")

    valid_paths = []
    valid_sentences = []
    missing_paths = []

    print("Перевірка шляхів...")
    for _, row in df.iterrows():
        # Manifests created on Windows use backslashes; normalize them.
        path_in_csv = row['path'].replace('\\', '/')

        # Relative paths are resolved against the dataset root.
        if not path_in_csv.startswith("/"):
            full_path = os.path.join(DATASET_DIR, path_in_csv)
        else:
            full_path = path_in_csv

        if os.path.exists(full_path):
            valid_paths.append(full_path)
            valid_sentences.append(row['sentence'])
        else:
            missing_paths.append(full_path)

    # Report every missing file, not only the case where row 0 is missing.
    if missing_paths:
        print(f"Не знайдено файлів: {len(missing_paths)}. Не знайдено перший файл: {missing_paths[0]}")

    if not valid_paths:
        print("Жодного файлу не знайдено! Перевірте правильність CSV.")
        return

    print(f"Успішно знайдено {len(valid_paths)} аудіофайлів. Завантаження...")
    audio_data = []
    references = []
    for path, sentence in zip(valid_paths, valid_sentences):
        reference = normalize_text(sentence)
        if not reference:
            # jiwer rejects empty reference strings; skip rows without usable text.
            print(f"Пропущено (порожній еталонний текст): {path}")
            continue
        audio = load_audio(path)
        if audio is None:
            # Drop the reference together with the unreadable clip, otherwise
            # every later prediction would be scored against the wrong sentence.
            continue
        audio_data.append(audio)
        references.append(reference)

    if not audio_data:
        print("Жодного аудіофайлу не вдалося завантажити.")
        return

    report_data = []

    for model_cfg in MODELS_TO_TEST:
        print(f"\n--- Тестування {model_cfg['name']} ---")
        try:
            if model_cfg["type"] == "whisper":
                predictions = run_whisper(model_cfg["name"], audio_data)
            else:
                predictions = run_wav2vec2(model_cfg["name"], audio_data)

            predictions_norm = [normalize_text(p) for p in predictions]
            error_wer = wer(references, predictions_norm)
            error_cer = cer(references, predictions_norm)

            print(f"Результат: WER={error_wer:.2f}, CER={error_cer:.2f}")
            report_data.append({"Model": model_cfg['name'], "WER": error_wer, "CER": error_cer})
        except Exception as e:
            # Best effort: one failing model must not abort the whole benchmark.
            print(f"Помилка моделі {model_cfg['name']}: {e}")

    print("\n=== РЕЗУЛЬТАТИ (PRE-TRAIN) ===")
    results_df = pd.DataFrame(report_data)
    print(results_df)
    results_df.to_csv("/content/benchmark_results_pre_train.csv", index=False)

if __name__ == "__main__":
    main()

Використовується пристрій: CUDA
Маніфест завантажено: 60 рядків.
Перевірка шляхів...
Успішно знайдено 60 аудіофайлів. Завантаження...

--- Тестування openai/whisper-tiny ---
⏳ Запуск openai/whisper-tiny...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/151M [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

preprocessor_config.json: 0.00B [00:00, ?B/s]

Device set to use cuda
`return_token_timestamps` is deprecated for WhisperFeatureExtractor and will be removed in Transformers v5. Use `return_attention_mask` instead, as the number of frames can be inferred from it.
Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.


Результат: WER=0.43, CER=0.11

--- Тестування openai/whisper-small ---
⏳ Запуск openai/whisper-small...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

preprocessor_config.json: 0.00B [00:00, ?B/s]

Device set to use cuda


Результат: WER=0.08, CER=0.02

--- Тестування anton-l/wav2vec2-large-xlsr-53-ukrainian ---
Запуск anton-l/wav2vec2-large-xlsr-53-ukrainian...




Результат: WER=0.17, CER=0.03

=== РЕЗУЛЬТАТИ (PRE-TRAIN) ===
                                      Model       WER       CER
0                       openai/whisper-tiny  0.432018  0.107059
1                      openai/whisper-small  0.081140  0.022745
2  anton-l/wav2vec2-large-xlsr-53-ukrainian  0.171053  0.030980


In [None]:
!pip install torchcodec
!pip install "datasets[audio]"



In [None]:
!pip install "datasets<3.0.0"

Collecting datasets<3.0.0
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting fsspec<=2024.6.1,>=2023.1.0 (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets<3.0.0)
  Downloading fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.6.1-py3-none-any.whl (177 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.6/177.6 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.0
    Uninstalling fsspec-2025.3.0:
      Successfully uninstalled fsspec-2025.3.0
  Attempting uninstall: datasets
    Found existing installation: datasets 4.0.0
    Uninstalling datasets-4.0.0:
      Successfully uninstalled datasets-4.0.0
[31mERROR: pip's d

In [None]:
import os
import pandas as pd
import torch
import numpy as np
from dataclasses import dataclass
from typing import Dict, List, Union
from datasets import Dataset, Audio
from transformers import (
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
    TrainingArguments,
    Trainer
)
from jiwer import wer

# Base checkpoint that gets fine-tuned.
MODEL_NAME = "anton-l/wav2vec2-large-xlsr-53-ukrainian"

# Paths (Colab): extracted dataset, its training manifest, and the folder
# where the fine-tuned model and processor are written.
DATASET_DIR = "/content/volyn_dialect_dataset"
TRAIN_MANIFEST = os.path.join(DATASET_DIR, "train_manifest.csv")
OUTPUT_DIR = "/content/wav2vec2-volyn-tuned"

# Training hyper-parameters, passed to TrainingArguments below.
# With BATCH_SIZE per device and GRADIENT_ACCUMULATION_STEPS accumulated
# batches, one optimizer step covers 4 * 2 = 8 samples per device.
BATCH_SIZE = 4
LEARNING_RATE = 1e-4
NUM_EPOCHS = 40
GRADIENT_ACCUMULATION_STEPS = 2

def prepare_dataset(batch):
    """Convert one dataset example into model inputs and CTC labels.

    Relies on the module-level ``processor`` (a Wav2Vec2Processor).

    Args:
        batch: A single example with an ``audio`` entry (providing ``array``
            and ``sampling_rate``) and a ``sentence`` entry with the transcript.

    Returns:
        The same example with ``input_values`` (processed audio) and
        ``labels`` (token ids of the transcript) added.
    """
    audio = batch["audio"]
    # Audio -> model input values.
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    # Transcript -> label ids. The `text=` argument routes the call to the
    # tokenizer and replaces the deprecated `as_target_processor()` context.
    batch["labels"] = processor(text=batch["sentence"]).input_ids
    return batch

@dataclass
class DataCollatorCTCWithPadding:
    """Pad audio inputs and CTC labels of a batch to common lengths.

    Audio and labels are padded separately because they have different
    lengths and use different padding logic (feature extractor vs tokenizer).
    """

    # Processor whose feature extractor pads the audio and whose tokenizer pads the labels.
    processor: Wav2Vec2Processor
    # Padding strategy forwarded unchanged to both `pad` calls.
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Collate examples carrying ``input_values`` and ``labels`` into one padded batch.

        Args:
            features: Examples as produced by the dataset preparation step.

        Returns:
            The padded audio batch with an added ``labels`` tensor in which
            padded positions are set to -100.
        """
        # Split audio and text so each can be padded by the matching component.
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        # Pad the audio.
        batch = self.processor.feature_extractor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )
        # Pad the text. The tokenizer is called directly, so the deprecated
        # `as_target_processor()` context manager is not needed here.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            return_tensors="pt",
        )

        # Replace label padding with -100 so those positions are ignored by the loss.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels
        return batch

def compute_metrics(pred):
    """Compute the word error rate for a Trainer evaluation prediction.

    Relies on the module-level ``processor`` for decoding.

    Args:
        pred: Object exposing ``predictions`` (logits) and ``label_ids``.

    Returns:
        A dict ``{"wer": <float>}``.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)
    # -100 marks ignored label positions; map them back to the pad token so
    # they can be decoded. np.where builds a new array, so the caller's
    # label_ids are left untouched.
    label_ids = np.where(
        pred.label_ids == -100,
        processor.tokenizer.pad_token_id,
        pred.label_ids,
    )
    pred_str = processor.batch_decode(pred_ids)
    # Labels are real text, not CTC output, so repeated tokens must not be merged.
    label_str = processor.batch_decode(label_ids, group_tokens=False)
    wer_score = wer(label_str, pred_str)
    return {"wer": wer_score}

if __name__ == "__main__":
    # Fine-tune MODEL_NAME on the training manifest, save the result to
    # OUTPUT_DIR and archive it to Google Drive.
    print(f"Loading model: {MODEL_NAME}")

    processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
    model = Wav2Vec2ForCTC.from_pretrained(
        MODEL_NAME,
        ctc_loss_reduction="mean",
        pad_token_id=processor.tokenizer.pad_token_id
    )

    # Freeze the feature encoder for stability on a small dataset.
    # (`freeze_feature_extractor` is the deprecated name of this method.)
    model.freeze_feature_encoder()

    print("Preparing dataset...")
    if not os.path.exists(TRAIN_MANIFEST):
        print(f"ERROR: File not found {TRAIN_MANIFEST}")
    else:
        df = pd.read_csv(TRAIN_MANIFEST)

        def fix_path(path):
            """Normalize Windows separators and resolve relative paths against DATASET_DIR."""
            path = path.replace('\\', '/')
            if not path.startswith('/'):
                return os.path.join(DATASET_DIR, path)
            return path

        df['path'] = df['path'].apply(fix_path)

        # Build the dataset: the "path" column is decoded as 16 kHz audio
        # and renamed to "audio", the name prepare_dataset reads.
        train_dataset = Dataset.from_pandas(df)
        train_dataset = train_dataset.cast_column("path", Audio(sampling_rate=16000))
        train_dataset = train_dataset.rename_column("path", "audio")

        # Apply the preparation function and drop the raw columns.
        train_dataset = train_dataset.map(prepare_dataset, remove_columns=train_dataset.column_names)

        data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

        training_args = TrainingArguments(
            output_dir=OUTPUT_DIR,
            group_by_length=True,
            per_device_train_batch_size=BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            eval_strategy="no",
            num_train_epochs=NUM_EPOCHS,
            # Mixed precision needs a GPU; enabling it unconditionally makes
            # the script fail on a CPU-only runtime.
            fp16=torch.cuda.is_available(),
            save_steps=500,
            learning_rate=LEARNING_RATE,
            warmup_steps=100,
            save_total_limit=2,
            logging_steps=10,
            report_to="none"
        )

        trainer = Trainer(
            model=model,
            data_collator=data_collator,
            args=training_args,
            compute_metrics=compute_metrics,
            train_dataset=train_dataset,
            # `processing_class` replaces the deprecated `tokenizer` argument.
            processing_class=processor.feature_extractor,
        )

        print("--- STARTING TRAINING ---")
        trainer.train()

        print("--- SAVING MODEL ---")
        model.save_pretrained(OUTPUT_DIR)
        processor.save_pretrained(OUTPUT_DIR)

        import subprocess  # local import: only this archiving step needs it

        output_zip = "/content/drive/MyDrive/AI_Lab4/volyn_model_result.zip"
        print(f"Archiving to {output_zip}...")
        # zip does not create missing parent folders, so make sure the target exists.
        os.makedirs(os.path.dirname(output_zip), exist_ok=True)
        # Argument list instead of a shell string: no quoting issues with paths.
        zip_result = subprocess.run(["zip", "-r", output_zip, OUTPUT_DIR])
        # Only claim success when zip actually succeeded.
        if zip_result.returncode == 0:
            print("DONE! Model saved to Google Drive.")
        else:
            print(f"ERROR: zip exited with code {zip_result.returncode}; archive was NOT written to {output_zip}")

Loading model: anton-l/wav2vec2-large-xlsr-53-ukrainian


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Preparing dataset...




Map:   0%|          | 0/90 [00:00<?, ? examples/s]

  trainer = Trainer(


--- STARTING TRAINING ---


  torch._C._get_cudnn_allow_tf32(),


Step,Training Loss
10,0.8198
20,0.7728
30,0.6716
40,0.5309
50,0.3645
60,0.2698
70,0.2054
80,0.1733
90,0.1762
100,0.096


--- SAVING MODEL ---
Archiving to /content/drive/MyDrive/AI_Lab4/volyn_model_result.zip...
DONE! Model saved to Google Drive.


In [None]:
import pandas as pd
import torch
import librosa
import numpy as np
import os
import re
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from jiwer import wer, cer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from difflib import SequenceMatcher

# Folder with the fine-tuned model/processor, the dataset root, and the test manifest.
MODEL_PATH = "/content/wav2vec2-volyn-tuned"
DATASET_DIR = "/content/volyn_dialect_dataset"
TEST_MANIFEST = os.path.join(DATASET_DIR, "test_manifest.csv")

# Canonical phrase -> intent label. get_closest_intent() compares normalized
# text against these keys, so the keys are written lowercase without punctuation.
INTENT_MAP = {
    "де тут подають добру бульбу або драніки": "Intent_1",
    "знайди мені де купити справжнього мацика": "Intent_2",
    "треба нові постоли або якісь міцні чоботи бо мої порвалися": "Intent_3",
    "кудою мені ліпше дибати до замку": "Intent_4",
    "де тут ягоди продають хочу додому взяти": "Intent_5",
    "хочу поїхати на світязь подивитися на ту велику воду": "Intent_6",
    "душа бажає свіжих пирогів з сиром": "Intent_7",
    "чи не буде сьогодні плюти бо не маю парасолі": "Intent_8",
    "де можна позичити ровера щоб по місту проїхатись": "Intent_9",
    "на що тут можна глипнути щоб не нудитися": "Intent_10"
}

# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device.upper()}")

def normalize_text(text):
    """Normalize a transcript for metric computation and intent matching.

    Lower-cases the text, removes every character that is not a lowercase
    Ukrainian Cyrillic letter or whitespace, and collapses each run of
    whitespace into a single space.

    Args:
        text: The raw transcript. Non-string values (e.g. NaN from pandas)
            are treated as missing.

    Returns:
        The normalized string, or "" when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    letters_only = re.sub(r"[^а-яіїєґ\s]", "", text.lower())
    # Deleting punctuation such as " - " leaves doubled spaces behind; those
    # distort CER and the string-similarity ratio, so squeeze whitespace runs.
    return " ".join(letters_only.split())

def get_closest_intent(text):
    """Return the intent ID of the reference phrase most similar to ``text``.

    The text is normalized and compared against every key of INTENT_MAP with
    difflib's similarity ratio (0.0 - 1.0). The first phrase with the highest
    ratio wins; "Unknown" is returned when no phrase scores above zero.
    """
    candidate = normalize_text(text)
    scored = [
        (SequenceMatcher(None, candidate, phrase).ratio(), intent_id)
        for phrase, intent_id in INTENT_MAP.items()
    ]
    # max() keeps the earliest entry among equal scores, like a strict ">" scan.
    top_ratio, top_intent = max(scored, key=lambda pair: pair[0], default=(0, "Unknown"))
    return top_intent if top_ratio > 0 else "Unknown"

def load_audio(path):
    """Read an audio file with librosa, resampled to 16 kHz.

    Args:
        path: Location of the audio file.

    Returns:
        The loaded samples, or None when loading raised an exception.
    """
    try:
        speech, _ = librosa.load(path, sr=16000)
        return speech
    except Exception as e:
        # Still best effort, but say which file failed and why instead of
        # silently shrinking the test set.
        print(f"Помилка завантаження {path}: {e}")
        return None

def main():
    """Evaluate the fine-tuned model on the test manifest.

    Transcribes every usable clip from TEST_MANIFEST with the model stored in
    MODEL_PATH, then prints ASR metrics (WER/CER) and intent-classification
    metrics (accuracy, macro F1, confusion matrix, per-class report), where
    intents are derived from the reference and predicted text via
    get_closest_intent(). A per-sample log is written to
    /content/final_test_results.csv.
    """
    if not os.path.exists(MODEL_PATH):
        print(f"ПОМИЛКА: Не знайдено модель у {MODEL_PATH}. Ви точно завершили тренування?")
        return

    # Fail with a readable message instead of a pandas traceback.
    if not os.path.exists(TEST_MANIFEST):
        print(f"ПОМИЛКА: Не знайдено {TEST_MANIFEST}.")
        return

    print("Завантаження натренованої моделі...")
    processor = Wav2Vec2Processor.from_pretrained(MODEL_PATH)
    model = Wav2Vec2ForCTC.from_pretrained(MODEL_PATH).to(device)

    print("Завантаження даних...")
    df = pd.read_csv(TEST_MANIFEST)

    references_text = []
    predictions_text = []
    y_true = []  # intents derived from the reference text
    y_pred = []  # intents derived from the recognized text
    skipped = 0  # rows dropped: missing file, unreadable audio or empty reference

    print("Початок тестування...")

    for _, row in df.iterrows():
        # Normalize Windows separators and resolve relative paths.
        path_in_csv = row['path'].replace('\\', '/')
        full_path = path_in_csv if path_in_csv.startswith("/") else os.path.join(DATASET_DIR, path_in_csv)

        if not os.path.exists(full_path):
            skipped += 1
            continue

        # jiwer rejects empty references, so drop such rows before inference.
        ref_norm = normalize_text(row['sentence'])
        if not ref_norm:
            skipped += 1
            continue

        # 1. Speech recognition (ASR)
        audio = load_audio(full_path)
        if audio is None:
            skipped += 1
            continue

        input_values = processor(audio, return_tensors="pt", sampling_rate=16000).input_values.to(device)

        with torch.no_grad():
            logits = model(input_values).logits

        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)[0]

        # 2. Store the normalized texts
        pred_norm = normalize_text(transcription)

        references_text.append(ref_norm)
        predictions_text.append(pred_norm)

        # 3. Intent classification
        y_true.append(get_closest_intent(ref_norm))   # expected intent
        y_pred.append(get_closest_intent(pred_norm))  # intent of the recognized text

    if skipped:
        print(f"Пропущено рядків маніфесту: {skipped}")

    # Nothing was evaluated: the metric functions below cannot handle empty input.
    if not references_text:
        print("ПОМИЛКА: Жодного зразка не оброблено. Перевірте шляхи у маніфесті.")
        return

    print("\n" + "="*40)
    print(" РЕЗУЛЬТАТИ ТЕСТУВАННЯ (POST-TRAIN)")
    print("="*40)

    # 1. ASR metrics
    error_wer = wer(references_text, predictions_text)
    error_cer = cer(references_text, predictions_text)
    print(f"\nASR Metrics:")
    print(f"WER (Word Error Rate):      {error_wer:.4f}")
    print(f"CER (Character Error Rate): {error_cer:.4f}")

    # 2. Classification metrics
    acc = accuracy_score(y_true, y_pred)
    f1 = classification_report(y_true, y_pred, output_dict=True)['macro avg']['f1-score']

    print(f"\nClassification Metrics:")
    print(f"Accuracy: {acc:.4f}, F1: {f1:.4f}")

    print("\nConfusion Matrix:")
    # Order labels by their numeric suffix so Intent_10 comes after Intent_9.
    labels = sorted(INTENT_MAP.values(), key=lambda x: int(x.split('_')[1]))
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    print(cm)

    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, labels=labels))

    results_df = pd.DataFrame({
        "Reference": references_text,
        "Predicted": predictions_text,
        "True_Intent": y_true,
        "Pred_Intent": y_pred
    })
    results_df.to_csv("/content/final_test_results.csv", index=False)
    print("\nДетальний лог збережено у /content/final_test_results.csv")

if __name__ == "__main__":
    main()

Using device: CUDA
Завантаження натренованої моделі...
Завантаження даних...
Початок тестування...

 РЕЗУЛЬТАТИ ТЕСТУВАННЯ (POST-TRAIN)

ASR Metrics:
WER (Word Error Rate):      0.1491
CER (Character Error Rate): 0.0275

Classification Metrics:
Accuracy: 1.0000, F1: 1.0000

Confusion Matrix:
[[6 0 0 0 0 0 0 0 0 0]
 [0 6 0 0 0 0 0 0 0 0]
 [0 0 6 0 0 0 0 0 0 0]
 [0 0 0 6 0 0 0 0 0 0]
 [0 0 0 0 6 0 0 0 0 0]
 [0 0 0 0 0 6 0 0 0 0]
 [0 0 0 0 0 0 6 0 0 0]
 [0 0 0 0 0 0 0 6 0 0]
 [0 0 0 0 0 0 0 0 6 0]
 [0 0 0 0 0 0 0 0 0 6]]

Classification Report:
              precision    recall  f1-score   support

    Intent_1       1.00      1.00      1.00         6
    Intent_2       1.00      1.00      1.00         6
    Intent_3       1.00      1.00      1.00         6
    Intent_4       1.00      1.00      1.00         6
    Intent_5       1.00      1.00      1.00         6
    Intent_6       1.00      1.00      1.00         6
    Intent_7       1.00      1.00      1.00         6
    Intent_8       