### Wav2Vec2-large-xlsr-korean 모델 Pre-training


In [1]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
import librosa
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [19]:
class AudioDataset(Dataset):
    def __init__(self, directory):
        self.directory = directory
        self.filenames = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.wav')]
        self.files = []

        for file in tqdm(self.filenames, desc="Loading audio files"):
            audio, _ = librosa.load(file, sr=16000, mono=True)
            self.files.append(torch.tensor(audio, dtype=torch.float32))

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        return self.files[idx]

# 데이터셋 인스턴스화
dataset = AudioDataset('../data/reduce/cuda테스트')  
loader = DataLoader(dataset, batch_size=1, shuffle=True)





Loading audio files: 100%|██████████| 13/13 [00:00<00:00, 276.22it/s]


In [3]:
import numpy
print(numpy.__version__)

1.19.5


In [12]:
from transformers import Wav2Vec2ForPreTraining, Trainer, TrainingArguments

In [20]:
print(transformers.__version__)

4.12.0


In [36]:
#!pip install transformers[torch]

In [37]:
#!pip install accelerate -U

In [38]:
#!pip uninstall accelerate -y

In [39]:
#!pip install accelerate==0.21.0

In [21]:
# 사전 학습된 모델 로드
model = Wav2Vec2ForPreTraining.from_pretrained("kresnik/wav2vec2-large-xlsr-korean")

# Trainer 설정
training_args = TrainingArguments(
    output_dir='./wav2vec2_pretrained',
    per_device_train_batch_size=1,  # GPU 메모리에 따라 조정
    num_train_epochs=10,            # 적절한 에폭 수 설정
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    do_train=True,
    no_cuda=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=lambda data: {'input_values': torch.cat([x.unsqueeze(0) for x in data], 0)},
    train_dataset=dataset
)

# Pre-training 시작
trainer.train()


loading configuration file https://huggingface.co/kresnik/wav2vec2-large-xlsr-korean/resolve/main/config.json from cache at C:\Users\Jws/.cache\huggingface\transformers\d33b22e404661c9d64ae19906d25af36ff93cb444de2ca532ee5f68ebe79440d.de221d5718b1871c71ff30c71be8d85c0e7038148801458ad4ee9b4a4a6e92e7
Model config Wav2Vec2Config {
  "_name_or_path": "facebook/wav2vec2-large-xlsr-53",
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForCTC"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 768,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": true,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "mean",
  "ctc_ze

In [None]:
from transformers import Wav2Vec2ForPreTraining, Trainer, TrainingArguments
import torch
from torch import nn


In [16]:

# 사전 학습된 모델 로드
model = Wav2Vec2ForPreTraining.from_pretrained("kresnik/wav2vec2-large-xlsr-korean")

# Trainer 설정
training_args = TrainingArguments(
    output_dir='./wav2vec2_pretrained',
    per_device_train_batch_size=1,
    num_train_epochs=10,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    do_train=True,
    no_cuda=True
)

# 손실 함수 정의 (예시)
def compute_loss(model, inputs):
    outputs = model(**inputs)
    logits = outputs.logits
    labels = inputs['labels']
    # 손실 함수 정의 (CrossEntropyLoss 등을 사용하여 필요한 경우 레이블에 맞게 계산)
    loss_fct = nn.CrossEntropyLoss()
    loss = loss_fct(logits.view(-1, model.config.vocab_size), labels.view(-1))
    return loss

# Trainer 클래스 커스터마이즈
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        # 입력에서 'labels' 키를 제거하고 모델로 전달
        labels = inputs.pop("labels", None)
        outputs = model(**inputs)
        # 사용자 정의 손실 계산
        loss = compute_loss(model, inputs={'input_values': inputs['input_values'], 'labels': labels})
        return (loss, outputs) if return_outputs else loss

# 데이터 콜레이터 함수 정의
def data_collator(batch):
    input_values = [x['input_values'] for x in batch]
    labels = [x['labels'] for x in batch]

    # 입력값들이 텐서인지 확인하고, 아니라면 텐서로 변환
    input_values_tensors = [torch.tensor(x).unsqueeze(0) if not isinstance(x, torch.Tensor) else x.unsqueeze(0) for x in input_values]
    labels_tensors = [torch.tensor(x).unsqueeze(0) if not isinstance(x, torch.Tensor) else x.unsqueeze(0) for x in labels]

    # 올바른 차원이 확인된 후 텐서 연결
    input_values_tensors = torch.cat(input_values_tensors, 0)
    labels_tensors = torch.cat(labels_tensors, 0)

    return {'input_values': input_values_tensors, 'labels': labels_tensors}

# Trainer 인스턴스 생성
trainer = CustomTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)

# Pre-training 시작
trainer.train()

loading configuration file https://huggingface.co/kresnik/wav2vec2-large-xlsr-korean/resolve/main/config.json from cache at C:\Users\Jws/.cache\huggingface\transformers\d33b22e404661c9d64ae19906d25af36ff93cb444de2ca532ee5f68ebe79440d.de221d5718b1871c71ff30c71be8d85c0e7038148801458ad4ee9b4a4a6e92e7
Model config Wav2Vec2Config {
  "_name_or_path": "facebook/wav2vec2-large-xlsr-53",
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForCTC"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 768,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": true,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "mean",
  "ctc_ze

IndexError: too many indices for tensor of dimension 1

In [5]:
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [5]:

if torch.cuda.is_available():
    
    print("CUDA is available. Training on GPU.")
else:
    print("CUDA is not available. Training on CPU.")

CUDA is available. Training on GPU.


In [6]:
print(torch.__version__)

1.8.0+cu111


In [3]:
#!pip install torch==1.8.0+cu111 torchvision==0.9.0+cu111 torchaudio===0.8.0 -f https://download.pytorch.org/whl/torch_stable.html


Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==1.8.0+cu111
  Downloading https://download.pytorch.org/whl/cu111/torch-1.8.0%2Bcu111-cp36-cp36m-win_amd64.whl (3055.7MB)
Collecting torchvision==0.9.0+cu111
  Downloading https://download.pytorch.org/whl/cu111/torchvision-0.9.0%2Bcu111-cp36-cp36m-win_amd64.whl (1.9MB)
Collecting torchaudio===0.8.0
  Downloading https://files.pythonhosted.org/packages/32/13/aee1b62921cc615173117d2d3a13e6543f55fd38c809d33589bf14d6693f/torchaudio-0.8.0-cp36-none-win_amd64.whl (109kB)
Collecting pillow>=4.1.1 (from torchvision==0.9.0+cu111)
  Downloading https://files.pythonhosted.org/packages/8f/10/c8dc9fff37b69b5962b7783ab4835611e83dada453cd9913d82ca2a1321b/Pillow-8.4.0-cp36-cp36m-win_amd64.whl (3.2MB)
Installing collected packages: torch, pillow, torchvision, torchaudio
Successfully installed pillow-8.4.0 torch-1.8.0+cu111 torchaudio-0.8.0 torchvision-0.9.0+cu111


You are using pip version 18.1, however version 21.3.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [4]:
!pip list

Package             Version
------------------- -----------
appdirs             1.4.4
audioread           3.0.1
backcall            0.2.0
certifi             2024.7.4
cffi                1.15.1
charset-normalizer  2.0.12
click               8.0.4
colorama            0.4.5
dataclasses         0.8
decorator           5.1.1
entrypoints         0.4
filelock            3.4.1
huggingface-hub     0.4.0
idna                3.7
importlib-metadata  4.8.3
importlib-resources 5.4.0
ipykernel           5.5.6
ipython             7.16.3
ipython-genutils    0.2.0
jedi                0.17.2
joblib              1.1.1
jupyter-client      7.1.2
jupyter-core        4.9.2
librosa             0.9.2
llvmlite            0.36.0
nest-asyncio        1.6.0
numba               0.53.1
numpy               1.19.5
packaging           21.3
parso               0.7.1
pickleshare         0.7.5
Pillow              8.4.0
pip                 21.3.1
pooch               1.6.0
prompt-toolkit      3.0.36
protobuf            3.19.

In [5]:
!pip freeze > a.text