Creación de dataset con Hugging Face

In [1]:
from datasets import Dataset
import evaluate
import pandas as pd
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the transcription table; the CSV is semicolon-delimited and UTF-8 encoded.
df = pd.read_csv(
    "audio_transcripcion.csv",
    sep=";",
    encoding="utf-8",
)
# Print the first rows as a quick check that both columns were parsed.
print(df.head())

                            path  \
0  common_voice_quy_41903591.wav   
1  common_voice_quy_41903592.wav   
2  common_voice_quy_41903593.wav   
3  common_voice_quy_41903594.wav   
4  common_voice_quy_41903595.wav   

                                            sentence  
0              Akchiyqa kay puykunamantam paqarimun.  
1  Utqayllam wiñanku, sisarinku, hinaspa pawayta ...  
2         Maynintam chayayman, puquio llaqtallayman.  
3  Manaña sachakuna kaptin, pachaqa chinkariyta q...  
4             Kaykunam hukniraq kallpasapa yurakuna.  


In [3]:
import os

# Directory that holds the audio clips named in the "path" column.
AUDIO_DIR = "segmentacion_vad"


def _with_audio_dir(path):
    """Return `path` prefixed with AUDIO_DIR, leaving already-prefixed paths untouched."""
    prefix = AUDIO_DIR + os.sep
    if path.startswith(prefix):
        return path
    return os.path.join(AUDIO_DIR, path)


# `df` is updated in place, so a plain join would yield
# "segmentacion_vad/segmentacion_vad/..." when this cell is run twice.
# The guarded helper keeps the cell idempotent.
df["path"] = df["path"].apply(_with_audio_dir)

In [4]:
# Wrap the DataFrame as a Hugging Face Dataset so it can be split and mapped.
dataset = Dataset.from_pandas(df=df)

In [5]:
# Hold out 10% of the examples for evaluation. The fixed seed makes the split
# (and the vocabulary that is later built from the training part) reproducible
# across kernel restarts.
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_ds, test_ds = dataset["train"], dataset["test"]

In [6]:
def clean_text(example):
    """Normalize the transcription of one example.

    Lower-cases the text, removes every character other than a-z, ñ, á, é, í,
    ó, ú, ü and whitespace, then collapses each run of whitespace into a
    single space and trims both ends.

    Args:
        example: Mapping that contains a "sentence" string.

    Returns:
        The same mapping, with "sentence" replaced by the normalized text.
    """
    text = example["sentence"].lower()
    text = re.sub(r"[^a-zñáéíóúü\s]", "", text)
    # Dropping punctuation can leave doubled spaces ("a - b" -> "a  b") and
    # `\s` lets tabs/newlines through. Collapse them so the text only ever
    # contains single spaces between words.
    text = re.sub(r"\s+", " ", text)
    example["sentence"] = text.strip()
    return example

In [7]:
# Normalize the transcriptions of both splits (train first, then test).
train_ds, test_ds = [split.map(clean_text) for split in (train_ds, test_ds)]

Map: 100%|██████████| 1153/1153 [00:00<00:00, 2366.24 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 2577.91 examples/s]


Tokenizador + Procesador

In [8]:
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2Processor

In [9]:
# Character-level vocabulary taken from the cleaned training transcriptions.
all_text = " ".join(train_ds["sentence"])
vocab = list(set(all_text))
vocab_dict = {ch: i for i, ch in enumerate(sorted(vocab))}

# The tokenizer is created with "|" as its word delimiter, so the space
# character hands its id over to "|".
vocab_dict["|"] = vocab_dict.pop(" ")

# The tokenizer is also created with "[UNK]" and "[PAD]"; give them explicit
# ids in the vocabulary file instead of relying on how the tokenizer treats
# special tokens that are missing from it.
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)

In [10]:
import json
# Persist the vocabulary as UTF-8 JSON; ensure_ascii=False keeps accented
# characters readable instead of \u-escaping them.
with open("vocab.json", mode="w", encoding="utf-8") as vocab_file:
    vocab_file.write(json.dumps(vocab_dict, ensure_ascii=False))

In [11]:
# Character-level CTC tokenizer backed by the vocabulary file written above.
tokenizer = Wav2Vec2CTCTokenizer(
    vocab_file="vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

In [12]:
from transformers import Wav2Vec2FeatureExtractor
# Feature extractor for raw mono waveforms at 16 kHz, with normalization enabled.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
)

In [13]:
# Bundle the audio feature extractor and the text tokenizer into one processor.
processor = Wav2Vec2Processor(
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
)

Preparación de datos

In [14]:
import librosa

def prepare_dataset(batch):
    """Turn one example into model inputs and CTC labels.

    Args:
        batch: A single example with "path" (audio file to load) and
            "sentence" (its transcription).

    Returns:
        The same example with "input_values" and "labels" added.
    """
    # Resample at load time to the rate the feature extractor is configured
    # with, so the two values cannot drift apart.
    target_sr = processor.feature_extractor.sampling_rate
    speech_array, _ = librosa.load(batch["path"], sr=target_sr)

    batch["input_values"] = processor(speech_array, sampling_rate=target_sr).input_values[0]

    # Encode the transcription with the tokenizer directly; this replaces the
    # deprecated `processor.as_target_processor()` context manager.
    batch["labels"] = processor.tokenizer(batch["sentence"]).input_ids

    return batch

In [15]:
# Build model inputs and labels for both splits (train first, then test),
# dropping the columns each split had before the call.
train_ds, test_ds = [
    split.map(prepare_dataset, remove_columns=split.column_names)
    for split in (train_ds, test_ds)
]

Map: 100%|██████████| 1153/1153 [01:58<00:00,  9.71 examples/s]
Map: 100%|██████████| 129/129 [00:04<00:00, 29.12 examples/s]


Entrenamiento

In [16]:
import torch
from transformers import Wav2Vec2ForCTC, TrainingArguments, Trainer

In [17]:
# Wav2Vec2ForCTC uses `config.pad_token_id` as the CTC blank label. Point it at
# the tokenizer's padding token; otherwise the checkpoint's default id is used,
# which can coincide with a real symbol of this custom vocabulary.
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base",
    vocab_size=len(tokenizer),
    pad_token_id=tokenizer.pad_token_id,
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
import accelerate
# Report which accelerate installation the kernel picked up: version, then location.
print(accelerate.__version__, accelerate.__file__, sep="\n")

1.10.1
C:\Users\user\anaconda3\envs\asr-quechua\lib\site-packages\accelerate\__init__.py


In [31]:
# Training configuration. Effective batch size per device is
# per_device_train_batch_size * gradient_accumulation_steps = 8.
training_args = TrainingArguments(
    output_dir="./asr_quechua",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # Evaluation is disabled by default; without an explicit strategy the
    # `eval_steps` value below is ignored and `compute_metrics` never runs.
    eval_strategy="steps",
    save_strategy="steps",
    num_train_epochs=5,
    fp16=False,
    save_total_limit=2,
    logging_steps=50,
    eval_steps=100,
    save_steps=100,
    learning_rate=3e-4,
    weight_decay=0.005,
)

In [32]:
# Word error rate metric used to score the decoded transcriptions.
wer_metric = evaluate.load(path="wer")

In [33]:
def compute_metrics(pred):
    """Compute the word error rate for one evaluation pass.

    Args:
        pred: Object exposing `predictions` (scores whose last axis is
            arg-maxed into token ids) and `label_ids`.
            NOTE(review): `label_ids` is assumed to be a NumPy array, as the
            Trainer provides — confirm if this is called from elsewhere.

    Returns:
        Dict with a single "wer" entry.
    """
    pred_ids = torch.argmax(torch.tensor(pred.predictions), dim=-1)

    # The data collator pads labels with -100. That value is not a token id,
    # so map it back to the pad token before decoding. Work on a copy to leave
    # the caller's array untouched.
    label_ids = pred.label_ids.copy()
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # References are real text, not CTC output: repeated characters must be kept.
    label_str = processor.batch_decode(label_ids, group_tokens=False)
    return {"wer": wer_metric.compute(predictions=pred_str, references=label_str)}

In [34]:
import torch

class DataCollatorCTCWithPadding:
    """Pad a list of examples into one batch for CTC training.

    Audio inputs and label sequences have different lengths, so they are
    padded separately: inputs with the processor's feature extractor, labels
    with its tokenizer. Padded label positions are set to -100.
    """

    def __init__(self, processor, padding=True):
        """Store the processor and the padding mode forwarded to both `pad` calls.

        Args:
            processor: Object exposing `feature_extractor` and `tokenizer`,
                each with a `pad` method.
            padding: Padding argument passed through to both `pad` calls.
        """
        self.processor = processor
        self.padding = padding

    def __call__(self, features):
        """Collate `features` into a dict with "input_values" and "labels".

        Args:
            features: List of examples, each with "input_values" and "labels".

        Returns:
            Dict with the padded "input_values" tensor and the padded "labels"
            tensor, where padding positions hold -100.
        """
        input_features = [{"input_values": f["input_values"]} for f in features]
        label_features = [{"input_ids": f["labels"]} for f in features]

        batch_inputs = self.processor.feature_extractor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )

        # The tokenizer is addressed directly, so the deprecated
        # `as_target_processor()` context manager is not needed here.
        batch_labels = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            return_tensors="pt",
        )

        # Positions where the attention mask is not 1 are padding; mark them
        # with -100 so they can be told apart from real token ids.
        labels = batch_labels["input_ids"].masked_fill(
            batch_labels["attention_mask"].ne(1), -100
        )

        return {
            "input_values": batch_inputs["input_values"],
            "labels": labels,
        }


In [35]:
# Collator that pads inputs and labels per batch.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    # `tokenizer=` is deprecated in favor of `processing_class=`; the same
    # object is passed, so what the Trainer receives is unchanged.
    processing_class=processor.feature_extractor,
)

  trainer = Trainer(


In [None]:
# Launch fine-tuning with the model, data and arguments held by `trainer`.
trainer.train()