In [7]:
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, BertTokenizer, BertModel , Wav2Vec2Model
import librosa

In [6]:

# 모델과 프로세서 불러오기

processor = Wav2Vec2Processor.from_pretrained("kresnik/wav2vec2-large-xlsr-korean")
model = Wav2Vec2ForCTC.from_pretrained("kresnik/wav2vec2-large-xlsr-korean").to('cuda')

# WAV 파일 로드하기
def load_wav(file_path):
    waveform, sample_rate = torchaudio.load(file_path)
    return waveform, sample_rate

# 음성 데이터 전처리 및 벡터화 함수
def vectorize_audio(file_path):
    # 음성 파일 로드
    waveform, sample_rate = load_wav(file_path)

    # 배치 차원 추가
    waveform = waveform.squeeze()  # [channels, sequence_length]
    if len(waveform.shape) == 1:
        waveform = waveform.unsqueeze(0)  # [1, sequence_length]
    
    # 프로세서로 음성 데이터 전처리
    inputs = processor(waveform, sampling_rate=sample_rate, return_tensors="pt", padding=True).input_values

    inputs = inputs.cuda()
    
    # 모델을 사용하여 음성 데이터를 벡터화
    with torch.no_grad():
        outputs = model(inputs).logits
    
    # 벡터화된 음성 데이터
    return outputs.last_hidden_state

# 예제 파일 벡터화
file_path = '../data2/KsponSpeech/sample.wav'
audio_vectors = vectorize_audio(file_path)

print(audio_vectors)


Some weights of the model checkpoint at kresnik/wav2vec2-large-xlsr-korean were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at kresnik/wav2vec2-large-xlsr-korean and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRA

RuntimeError: Expected 2D (unbatched) or 3D (batched) input to conv1d, but got input of size: [1, 1, 1, 197120]

### 음성 텍스트 추출

In [8]:
# 모델과 프로세서 로드
model_name = "kresnik/wav2vec2-large-xlsr-korean"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)

# 오디오 파일 로드
audio_file_path = "../data2/test/test.wav"
audio, rate = librosa.load(audio_file_path, sr=16000)

# 입력값 준비
input_values = processor(audio, return_tensors="pt", sampling_rate=16000).input_values

# 모델 예측
with torch.no_grad():
    logits = model(input_values).logits

# 예측된 텍스트 아이디 디코딩
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.decode(predicted_ids[0])

transcription

Some weights of the model checkpoint at kresnik/wav2vec2-large-xlsr-korean were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at kresnik/wav2vec2-large-xlsr-korean and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRA

'우빠 어제 쓸먹고 또 사 고쳤어'

### 음성 벡터화

In [11]:
# 모델과 프로세서 로드
model_name = "kresnik/wav2vec2-large-xlsr-korean"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name)

# 오디오 파일 로드
audio_file_path = "../data2/test/test.wav"
audio, rate = librosa.load(audio_file_path, sr=16000)

# 입력값 준비
input_values = processor(audio, return_tensors="pt", sampling_rate=16000).input_values

# 모델을 사용하여 벡터 추출
with torch.no_grad():
    outputs = model(input_values)
    # 벡터화된 특징 추출
    last_hidden_states = outputs.last_hidden_state

last_hidden_states.shape, last_hidden_states

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at kresnik/wav2vec2-large-xlsr-korean and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


(torch.Size([1, 153, 1024]),
 tensor([[[-0.2751,  1.0104,  0.7403,  ...,  0.5307, -0.7481, -1.0759],
          [-0.3279,  0.7733,  0.8325,  ...,  0.8916, -0.8439, -1.0155],
          [-0.3528,  0.5103,  0.9029,  ...,  0.9676, -0.9143, -0.9952],
          ...,
          [-0.9939,  0.9068,  0.8248,  ..., -0.5836,  0.1763, -2.0313],
          [-0.8239,  1.0928,  0.8348,  ..., -0.5341,  0.1182, -1.9940],
          [ 0.4290,  1.4947,  0.6535,  ...,  0.5155,  0.4039, -1.8304]]]))