In [3]:
from jupyter_core.paths import jupyter_runtime_dir
print(jupyter_runtime_dir())

/storage/student5/xuan_quy/asrdata/cache/


In [9]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import librosa
import torch
from jiwer import wer 
device = torch.device("cuda:3")

processor = Wav2Vec2Processor.from_pretrained("khanhld/wav2vec2-base-vietnamese-160h")
model = Wav2Vec2ForCTC.from_pretrained("khanhld/wav2vec2-base-vietnamese-160h")
model.to(device)
print(f"Total Parameters: {sum(p.numel() for p in model.parameters())}")

def transcribe(wav):
    input_values = processor(wav, sampling_rate=16000, return_tensors="pt").input_values
    print(input_values.shape)
    logits = model(input_values.to(device)).logits
    pred_ids = torch.argmax(logits, dim=-1)
    pred_transcript = processor.batch_decode(pred_ids)[0]
    
    return pred_transcript


wav, _ = librosa.load('/storage/student5/xuan_quy/asrdata/wav/2.wav', sr = 16000)
print(f"transcript: {transcribe(wav)}")
with open("txt/2.txt","r") as f:
    true = f.read()
    print(f"True: {true}")
print(f"Score: {wer(transcribe(wav), true)}")
    

Total Parameters: 94445536
torch.Size([1, 102080])
transcript: tướng ca pu chia còn yêu cầu thủ tướng sin ga po phải điều chỉnh phát biểu không đúng sự thật chút nào này theo lợi ông
True: tướng cam pu chia còn yêu cầu thủ tướng sinh ga po phải điều chỉnh phát biểu không đúng sự thật chút nào này theo lời ông
torch.Size([1, 102080])
Score: 0.1111111111111111


In [2]:
import onnx
# import onnxruntime as ort
def export(model):
    """Export model into onnx format"""
    dummy_input = torch.randn(1, 16000)
    input_names = ["audio"]
    output_names = ["text"]
    torch.onnx.export(model, dummy_input, "wav2vec2-base-vietnamese-160h.onnx", input_names=input_names, output_names=output_names, opset_version=12)

In [7]:
import torch
torch.set_default_device('cuda:3')
print(torch.get_default_device())


cuda:3


In [3]:
import torch
from datasets import load_dataset, Audio
import json
from transformers import (
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2Processor,
    Wav2Vec2ForCTC,
    TrainingArguments,
    Trainer
)
from evaluate import load
import re
from dataclasses import dataclass
from typing import Dict, List, Union
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import torchaudio
from pathlib import Path
from torchsummary import summary
torch.set_default_device('cuda:3')
print(torch.get_default_device())

# New function to create metadata CSV files
# def create_metadata_csv(audio_dir, transcription_dir, output_dir, test_size=0.2):
#     """
#     Create CSV metadata files from directories containing audio files and transcriptions.
    
#     Parameters:
#     - audio_dir: Directory containing .wav files
#     - transcription_dir: Directory containing transcription files
#     - output_dir: Directory to save the CSV files
#     - test_size: Proportion of data to use for testing
#     """
#     data = []
    
#     # Create output directory if it doesn't exist
#     os.makedirs(output_dir, exist_ok=True)
    
#     # Iterate through audio files
#     for audio_file in Path(audio_dir).glob("*.wav"):
#         file_stem = audio_file.stem
        
#         # Look for corresponding transcription file
#         # Assuming transcriptions are in .txt files with same name as audio
#         trans_file = Path(transcription_dir) / f"{file_stem}.txt"
        
#         if trans_file.exists():
#             # Read transcription
#             with open(trans_file, 'r', encoding='utf-8') as f:
#                 transcription = f.read().strip()
#                 print(f"Processing {audio_file}: {transcription}")
#             # Verify audio file
#             try:
#                 waveform, sample_rate = torchaudio.load(str(audio_file))
#                 duration = waveform.shape[1] / sample_rate
                
#                 # Only include files that are valid
#                 if duration > 0:
#                     data.append({
#                         'audio': str(audio_file.absolute()),
#                         'text': transcription,
#                         'duration': duration
#                     })
#             except Exception as e:
#                 print(f"Error processing {audio_file}: {str(e)}")
#             print(f"Processed {audio_file}")    
#     # Create DataFrame
#     df = pd.DataFrame(data)
    
#     # Split into train and test sets
#     train_df, test_df = train_test_split(
#         df, 
#         test_size=test_size, 
#         random_state=42
#     )
    
#     # Save CSV files
#     train_df.to_csv(os.path.join(output_dir, 'train.csv'), index=False)
#     test_df.to_csv(os.path.join(output_dir, 'test.csv'), index=False)
    
#     print(f"Created metadata files:")
#     print(f"Train set: {len(train_df)} samples")
#     print(f"Test set: {len(test_df)} samples")
    
#     return os.path.join(output_dir, 'train.csv'), os.path.join(output_dir, 'test.csv')

# Previous functions remain the same
def prepare_vietnamese_text(batch):
    text = re.sub(r'[^\w\s]', '', batch["text"].lower())
    return text

def create_vocabulary(dataset):
    vocab_dict = {
        "<pad>": 0,
        "<unk>": 1,
        "<s>": 2,
        "</s>": 3,
    }
    
    chars = set()
    for text in dataset["text"]:
        chars.update(list(prepare_vietnamese_text({"text": text})))
    
    for i, char in enumerate(sorted(list(chars))):
        vocab_dict[char] = i + 4
    
    return vocab_dict

def prepare_dataset(batch):
    audio = batch["audio"]
    
    batch["input_values"] = processor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_attention_mask=True
    ).input_values[0]
    
    batch["labels"] = processor(
        text=prepare_vietnamese_text(batch)
    ).input_ids
    
    return batch

@dataclass
class DataCollatorCTCWithPadding:
    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    
    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [feature["labels"] for feature in features]
        
        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )
        
        labels_batch = torch.nn.utils.rnn.pad_sequence(
            [torch.tensor(feat) for feat in label_features],
            batch_first=True,
            padding_value=-100
        )
        
        batch["labels"] = labels_batch
        
        return batch

def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)
    
    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
    
    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(pred.label_ids, group_tokens=True)
    
    wer_metric = load("wer")
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    
    return {"wer": wer}

def train_vietnamese_asr(audio_dir, transcription_dir, output_dir):
    """
    Main function to prepare data and train the model
    
    Parameters:
    - audio_dir: Directory containing .wav files
    - transcription_dir: Directory containing transcription filesload_metric
    - output_dir: Directory to save the model and metadata
    """
    # Create metadata CSV files
    # print("Creating metadata CSV files...")
    # train_csv, test_csv = create_metadata_csv(
    #     audio_dir=audio_dir,
    #     transcription_dir=transcription_dir,
    #     output_dir=output_dir
    # )
    
    # Load dataset
    train_csv ='dataset/train.csv'
    test_csv = 'dataset/test.csv'


    dataset = load_dataset(
        "csv", 
        data_files={"train": train_csv, "test": test_csv},
        delimiter=","
    )
    
    # Load audio files
    dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000))
    
    # Create vocabulary and save it
    # vocab_dict = create_vocabulary(dataset["train"])
    # print(vocab_dict)
    # vocab_path = os.path.join(output_dir, "vocab.json")
    
    # with open(vocab_path, 'w', encoding='utf-8') as f:
    #     json.dump(vocab_dict, f, ensure_ascii=False, indent=2)
    vocab_path = 'dataset/vocab.json'
    tokenizer = Wav2Vec2CTCTokenizer(
        vocab_path,
        unk_token="<unk>",
        pad_token="<pad>",
        word_delimiter_token="|"
    )
    
    # Create feature extractor
    feature_extractor = Wav2Vec2FeatureExtractor(
        feature_size=1,
        sampling_rate=16_000,
        padding_value=0.0,
        do_normalize=True,
        return_attention_mask=False
    )
    
    # Create processor
    global processor
    processor = Wav2Vec2Processor(
        feature_extractor=feature_extractor,
        tokenizer=tokenizer
    )
    
    # Process dataset
    processed_dataset = dataset.map(
        prepare_dataset,
        remove_columns=dataset.column_names["train"],
        num_proc=4
    )
    
    # Load pre-trained model
    model = Wav2Vec2ForCTC.from_pretrained(
        "khanhld/wav2vec2-base-vietnamese-160h",
        attention_dropout=0.1,
        hidden_dropout=0.1,
        feat_proj_dropout=0.0,
        mask_time_prob=0.05,
        layerdrop=0.1,
        ctc_loss_reduction="mean",
        pad_token_id=processor.tokenizer.pad_token_id,
        vocab_size=len(processor.tokenizer)
    )
    # print(f"Model summary: {summary(model)}")
    model.freeze_feature_encoder()
    
    training_args = TrainingArguments(
        output_dir=os.path.join(output_dir, "vietnamese-wav2vec2"),
        group_by_length=True,
        per_device_train_batch_size=16,
        gradient_accumulation_steps=2,
        evaluation_strategy="steps",
        num_train_epochs=30,
        fp16=True,
        save_steps=500,
        eval_steps=500,
        logging_steps=500,
        learning_rate=1e-4,
        warmup_steps=1000,
        save_total_limit=2
    )
    
    trainer = Trainer(
        model=model,
        data_collator=DataCollatorCTCWithPadding(processor=processor, padding=True),
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=processed_dataset["train"],
        eval_dataset=processed_dataset["test"],
        tokenizer=processor.feature_extractor,
    )
    
    trainer.train()
    trainer.save_model(os.path.join(output_dir, "vietnamese-wav2vec2-final"))

if __name__ == "__main__":
    print("Starting...")
    train_vietnamese_asr(
        audio_dir="wav",
        transcription_dir="txt",
        output_dir="dataset"
    )

cuda:3
Starting...


RuntimeError: Error(s) in loading state_dict for Wav2Vec2ForCTC:
	size mismatch for lm_head.weight: copying a param with shape torch.Size([96, 768]) from checkpoint, the shape in current model is torch.Size([98, 768]).
	size mismatch for lm_head.bias: copying a param with shape torch.Size([96]) from checkpoint, the shape in current model is torch.Size([98]).
	You may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method.

In [3]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from datasets import load_dataset
import torch
import re
from datasets import load_dataset, Audio
import evaluate

wer = evaluate.load('wer')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# load processor and model
processor = Wav2Vec2Processor.from_pretrained("khanhld/wav2vec2-base-vietnamese-160h")
model = Wav2Vec2ForCTC.from_pretrained("khanhld/wav2vec2-base-vietnamese-160h")
model.to(device)
model.eval()

# Load dataset
test_dataset = load_dataset("mozilla-foundation/common_voice_8_0", "vi", split="test", use_auth_token="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")
test_dataset = test_dataset.cast_column("audio", Audio(sampling_rate=16000))
chars_to_ignore = r'[,?.!\-;:"“%\'�]' # ignore special characters

# preprocess data
def preprocess(batch):
  audio = batch["audio"]
  batch["input_values"] = audio["array"]
  batch["transcript"] = re.sub(chars_to_ignore, '', batch["sentence"]).lower()
  return batch

# run inference
def inference(batch):
  input_values = processor(batch["input_values"], 
                            sampling_rate=16000, 
                            return_tensors="pt").input_values
  logits = model(input_values.to(device)).logits
  pred_ids = torch.argmax(logits, dim=-1)
  batch["pred_transcript"] = processor.batch_decode(pred_ids) 
  return batch
  
test_dataset = test_dataset.map(preprocess)
result = test_dataset.map(inference, batched=True, batch_size=1)
print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_transcript"], references=result["transcript"])))

DatasetNotFoundError: Dataset 'mozilla-foundation/common_voice_8_0' is a gated dataset on the Hub. You must be authenticated to access it.