In [4]:
import os 
import torchaudio
from datasets import Dataset, DatasetDict
from transformers import Wav2Vec2Processor, Wav2Vec2ForPreTraining, AdamW, get_scheduler
import numpy as np
import torch

#### 데이터셋 준비

In [20]:
# 파일 경로 설정 
data_dir = '../감정 분류를 위한 대화 음성 데이터셋/5차년도/5차년도 2'
wav_files = [os.path.join(data_dir,f) for f in os.listdir(data_dir) if f.endswith('.wav')]

# 데이터셋 생성 함수
def load_wav_file(file_path):
    speech_array, sampling_rate = torchaudio.load(file_path)
    return {"speech": speech_array.squeeze().numpy(), "sampling_rate": sampling_rate}

# 데이터셋 생성
data = [load_wav_file(file) for file in wav_files]
dataset = Dataset.from_dict({'audio':data})

# 데이터셋 분할
dataset = dataset.train_test_split(test_size = 0.2)
dataset_dict = DatasetDict({"train": dataset['train'], 'test': dataset['test']})

In [21]:
dataset

DatasetDict({
    train: Dataset({
        features: ['audio'],
        num_rows: 80
    })
    test: Dataset({
        features: ['audio'],
        num_rows: 20
    })
})

#### 모델 및 프로세서 설정

In [22]:
processor = Wav2Vec2Processor.from_pretrained("kresnik/wav2vec2-large-xlsr-korean")
model = Wav2Vec2ForPreTraining.from_pretrained("kresnik/wav2vec2-large-xlsr-korean").to('cuda')

Some weights of Wav2Vec2ForPreTraining were not initialized from the model checkpoint at kresnik/wav2vec2-large-xlsr-korean and are newly initialized: ['project_hid.bias', 'project_hid.weight', 'project_q.bias', 'project_q.weight', 'quantizer.codevectors', 'quantizer.weight_proj.bias', 'quantizer.weight_proj.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### 데이터 전처리 및 데이터 로더 설정


In [23]:
def prepare_dataset(batch):
    audio = batch['audio']
    batch['input_values'] = processor(audio['speech'], sampling_rate = audio['sampling_rate'], return_tensors = 'pt').input_values[0]
    return batch

# 데이터 전처리
dataset = dataset_dict.map(prepare_dataset, remove_columns=dataset_dict['train'].column_names)

# 데이터 로더 설정
from torch.utils.data import DataLoader

train_dataloader = DataLoader(dataset['train'], batch_size=4, shuffle=True)
test_dataloader = DataLoader(dataset['test'], batch_size=4)

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]


ValueError: The model corresponding to this feature extractor: Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}
 was trained using a sampling rate of 16000. Please make sure that the provided `raw_speech` input was sampled with 16000 and not 48000.

#### 학습 설정 및 학습 루프

In [None]:
# 옵티마이저 설정
optimizer = AdamW(model.parameters(), lr=1e-4)

# 학습 스케줄러 설정
num_training_steps = len(train_dataloader) * 10 # 예를 들어 10 epochs
lr_scheduler = get_scheduler(
    name='linear',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# 모델 학습 루프
model.train()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(10):  # 예를 들어 10 epochs
    for batch in train_dataloader:
        inputs = batch["input_values"].to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
    print(f"Epoch {epoch + 1} completed")

print("Model pre-training complete.")

In [24]:
import os
import torchaudio
from datasets import Dataset, DatasetDict
from transformers import Wav2Vec2Processor
import torchaudio.transforms as T

# 파일 경로 설정
data_dir = "../감정 분류를 위한 대화 음성 데이터셋/5차년도/5차년도 2"
wav_files = [os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith('.wav')]

# 모델과 프로세서 로드
model_name = "kresnik/wav2vec2-large-xlsr-korean"
processor = Wav2Vec2Processor.from_pretrained(model_name)

# 리샘플러 설정
resampler = T.Resample(orig_freq=48000, new_freq=16000)

def load_wav_file(file_path):
    speech_array, sampling_rate = torchaudio.load(file_path)
    
    # 샘플링 레이트가 16kHz가 아닌 경우 리샘플링
    if sampling_rate != 16000:
        speech_array = resampler(speech_array)
    
    return {"speech": speech_array.squeeze().numpy(), "sampling_rate": 16000}

# 데이터셋 생성
data = [load_wav_file(file) for file in wav_files]
dataset = Dataset.from_dict({"audio": data})

# 데이터셋 분할
dataset = dataset.train_test_split(test_size=0.1)
dataset_dict = DatasetDict({"train": dataset["train"], "test": dataset["test"]})

# 데이터 전처리 함수
def prepare_dataset(batch):
    audio = batch['audio']
    batch['input_values'] = processor(audio['speech'], sampling_rate=audio['sampling_rate'], return_tensors='pt').input_values.squeeze(0)
    return batch

# 데이터 전처리
dataset_dict = dataset_dict.map(prepare_dataset, remove_columns=dataset_dict['train'].column_names)

# 데이터 로더 설정
from torch.utils.data import DataLoader

train_dataloader = DataLoader(dataset_dict['train'], batch_size=4, shuffle=True)
test_dataloader = DataLoader(dataset_dict['test'], batch_size=4)

# 모델 로드
from transformers import Wav2Vec2ForPreTraining

model = Wav2Vec2ForPreTraining.from_pretrained(model_name)

# 옵티마이저 및 학습 스케줄러 설정
from transformers import AdamW, get_scheduler

optimizer = AdamW(model.parameters(), lr=1e-4)
num_training_steps = len(train_dataloader) * 10  # 예를 들어 10 epochs
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# 모델 학습 루프
model.train()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(10):  # 예를 들어 10 epochs
    for batch in train_dataloader:
        inputs = batch['input_values'].to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
    print(f"Epoch {epoch + 1} completed")

print("Model pre-training complete.")


Map: 100%|██████████| 90/90 [00:05<00:00, 17.02 examples/s]
Map: 100%|██████████| 10/10 [00:00<00:00, 18.45 examples/s]
Some weights of Wav2Vec2ForPreTraining were not initialized from the model checkpoint at kresnik/wav2vec2-large-xlsr-korean and are newly initialized: ['project_hid.bias', 'project_hid.weight', 'project_q.bias', 'project_q.weight', 'quantizer.codevectors', 'quantizer.weight_proj.bias', 'quantizer.weight_proj.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: each element in list of batch should be of equal size

In [26]:
import os
import torchaudio
from datasets import Dataset, DatasetDict
from transformers import Wav2Vec2Processor
import torchaudio.transforms as T
from torch.utils.data import DataLoader

# 파일 경로 설정
data_dir = "../감정 분류를 위한 대화 음성 데이터셋/5차년도/5차년도 2"
wav_files = [os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith('.wav')]

# 모델과 프로세서 로드
model_name = "kresnik/wav2vec2-large-xlsr-korean"
processor = Wav2Vec2Processor.from_pretrained(model_name)

# 리샘플러 설정
resampler = T.Resample(orig_freq=48000, new_freq=16000)

def load_wav_file(file_path):
    speech_array, sampling_rate = torchaudio.load(file_path)
    
    # 샘플링 레이트가 16kHz가 아닌 경우 리샘플링
    if sampling_rate != 16000:
        speech_array = resampler(speech_array)
    
    return {"speech": speech_array.squeeze().numpy(), "sampling_rate": 16000}

# 데이터셋 생성
data = [load_wav_file(file) for file in wav_files]
dataset = Dataset.from_dict({"audio": data})

# 데이터셋 분할
dataset = dataset.train_test_split(test_size=0.1)
dataset_dict = DatasetDict({"train": dataset["train"], "test": dataset["test"]})

# 데이터 전처리 함수
def prepare_dataset(batch):
    audio = batch['audio']
    batch['input_values'] = processor(audio['speech'], sampling_rate=audio['sampling_rate'], return_tensors='pt').input_values[0]
    return batch

# 데이터 전처리
dataset_dict = dataset_dict.map(prepare_dataset, remove_columns=dataset_dict['train'].column_names)

# 패딩 함수
def collate_fn(batch):
    input_values = [item['input_values'] for item in batch]
    padded_input_values = processor.pad(input_values, return_tensors='pt', padding='longest').input_values
    return {'input_values': padded_input_values}

# 데이터 로더 설정
train_dataloader = DataLoader(dataset_dict['train'], batch_size=4, shuffle=True, collate_fn=collate_fn)
test_dataloader = DataLoader(dataset_dict['test'], batch_size=4, collate_fn=collate_fn)

# 모델 로드
from transformers import Wav2Vec2ForPreTraining

model = Wav2Vec2ForPreTraining.from_pretrained(model_name)

# 옵티마이저 및 학습 스케줄러 설정
from transformers import AdamW, get_scheduler

optimizer = AdamW(model.parameters(), lr=1e-4)
num_training_steps = len(train_dataloader) * 10  # 예를 들어 10 epochs
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# 모델 학습 루프
model.train()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(10):  # 예를 들어 10 epochs
    for batch in train_dataloader:
        inputs = batch['input_values'].to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
    print(f"Epoch {epoch + 1} completed")

print("Model pre-training complete.")


Map: 100%|██████████| 90/90 [00:05<00:00, 17.95 examples/s]
Map: 100%|██████████| 10/10 [00:00<00:00, 20.32 examples/s]
Some weights of Wav2Vec2ForPreTraining were not initialized from the model checkpoint at kresnik/wav2vec2-large-xlsr-korean and are newly initialized: ['project_hid.bias', 'project_hid.weight', 'project_q.bias', 'project_q.weight', 'quantizer.codevectors', 'quantizer.weight_proj.bias', 'quantizer.weight_proj.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AttributeError: 'list' object has no attribute 'keys'

In [28]:
import os
import torchaudio
from datasets import Dataset, DatasetDict
from transformers import Wav2Vec2Processor
import torchaudio.transforms as T
from torch.utils.data import DataLoader

# 파일 경로 설정
data_dir = "../감정 분류를 위한 대화 음성 데이터셋/5차년도/5차년도 2"
wav_files = [os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith('.wav')]

# 모델과 프로세서 로드
model_name = "kresnik/wav2vec2-large-xlsr-korean"
processor = Wav2Vec2Processor.from_pretrained(model_name)

# 리샘플러 설정
resampler = T.Resample(orig_freq=48000, new_freq=16000)

def load_wav_file(file_path):
    speech_array, sampling_rate = torchaudio.load(file_path)
    
    # 샘플링 레이트가 16kHz가 아닌 경우 리샘플링
    if sampling_rate != 16000:
        speech_array = resampler(speech_array)
    
    return {"speech": speech_array.squeeze().numpy(), "sampling_rate": 16000}

# 데이터셋 생성
data = [load_wav_file(file) for file in wav_files]
dataset = Dataset.from_dict({"audio": data})

# 데이터셋 분할
dataset = dataset.train_test_split(test_size=0.1)
dataset_dict = DatasetDict({"train": dataset["train"], "test": dataset["test"]})

# 데이터 전처리 함수
def prepare_dataset(batch):
    audio = batch['audio']
    batch['input_values'] = processor(audio['speech'], sampling_rate=audio['sampling_rate'], return_tensors='pt').input_values[0]
    return batch

# 데이터 전처리
dataset_dict = dataset_dict.map(prepare_dataset, remove_columns=dataset_dict['train'].column_names)

# 패딩 함수
def collate_fn(batch):
    input_values = [item['input_values'] for item in batch]
    padded_input_values = processor.pad(input_values, return_tensors='pt', padding='longest').input_values
    return {'input_values': padded_input_values}

# 데이터 로더 설정
train_dataloader = DataLoader(dataset_dict['train'], batch_size=4, shuffle=True, collate_fn=collate_fn)
test_dataloader = DataLoader(dataset_dict['test'], batch_size=4, collate_fn=collate_fn)

# 모델 로드
from transformers import Wav2Vec2ForPreTraining
import torch

model = Wav2Vec2ForPreTraining.from_pretrained(model_name)

# 옵티마이저 및 학습 스케줄러 설정
from transformers import AdamW, get_scheduler

optimizer = AdamW(model.parameters(), lr=1e-4)
num_training_steps = len(train_dataloader) * 10  # 예를 들어 10 epochs
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)


Map: 100%|██████████| 90/90 [00:05<00:00, 17.31 examples/s]
Map: 100%|██████████| 10/10 [00:00<00:00, 19.97 examples/s]
Some weights of Wav2Vec2ForPreTraining were not initialized from the model checkpoint at kresnik/wav2vec2-large-xlsr-korean and are newly initialized: ['project_hid.bias', 'project_hid.weight', 'project_q.bias', 'project_q.weight', 'quantizer.codevectors', 'quantizer.weight_proj.bias', 'quantizer.weight_proj.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x23c04ab9fa0>

In [34]:

# 모델 학습 루프
model.train()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(10):  # 예를 들어 10 epochs
    for batch in train_dataloader:
        inputs = batch['input_values'].to(device)
        optimizer.zero_grad()
        outputs = model(inputs, output_hidden_states=True)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
    print(f"Epoch {epoch + 1} completed")

print("Model pre-training complete.")


AttributeError: 'list' object has no attribute 'keys'

In [37]:
for i in train_dataloader:
    print(i)

AttributeError: 'list' object has no attribute 'keys'