In [1]:
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torchaudio


  from .autonotebook import tqdm as notebook_tqdm


In [5]:


def load_audio(file_path):
    waveform, sample_rate = torchaudio.load(file_path)
    return waveform, sample_rate

def preprocess_audio(waveform, processor, sample_rate):
    # 모델에 맞는 샘플링 레이트로 리샘플링 필요 시 진행
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)
    # 차원 조정: [1, Length] 형태로 조정
    waveform = waveform.squeeze(0)  # 첫 번째 차원이 1이면 제거
    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
    return inputs


def extract_features_and_text(model, inputs, processor):
    with torch.no_grad():
        # 입력 차원 확인 및 조정
        input_values = inputs.input_values.squeeze()  # 차원 축소가 필요할 경우
        logits = model(input_values).logits  # 모델에 입력
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    return logits, transcription


# 파일 경로 설정
file_path = './output/combined_audio_5.wav'

# 모델과 프로세서 로드
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h").to('cuda')

# 오디오 로드 및 전처리
waveform, sample_rate = load_audio(file_path)
inputs = preprocess_audio(waveform, processor,sample_rate)

# 특징 추출 및 텍스트 변환
features, transcription = extract_features_and_text(model, inputs, processor)

print("Transcription:", transcription)
print("Features shape:", features.shape)


Some weights of the model checkpoint at facebook/wav2vec2-large-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You s

RuntimeError: Given groups=1, weight of size [512, 1, 10], expected input[1, 534480, 1] to have 1 channels, but got 534480 channels instead

In [7]:
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torchaudio


Some weights of the model checkpoint at facebook/wav2vec2-large-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You s

RuntimeError: Expected 2D (unbatched) or 3D (batched) input to conv1d, but got input of size: [1, 1, 1, 534480]

In [None]:

def load_audio(file_path):
    waveform, sample_rate = torchaudio.load(file_path)
    return waveform, sample_rate

def preprocess_audio(waveform, processor, sample_rate):
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)
    # [1, Length] 형태로 변경
    waveform = waveform.squeeze()  # 불필요한 차원 제거
    if waveform.ndim == 1:
        waveform = waveform.unsqueeze(0)  # 배치 차원 추가
    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
    return inputs

def extract_features_and_text(model, inputs):
    with torch.no_grad():
        # inputs.input_values는 [batch size, sequence length] 형태여야 함
        logits = model(inputs.input_values).logits  # 차원 문제 수정
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    return logits, transcription

# 파일 경로 및 모델 로딩
file_path = './output/combined_audio_5.wav'
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")

# 오디오 로드 및 전처리
waveform, sample_rate = load_audio(file_path)
inputs = preprocess_audio(waveform, processor, sample_rate)

# 특징 추출 및 텍스트 변환
features, transcription = extract_features_and_text(model, inputs)

print("Transcription:", transcription)
print("Features shape:", features.shape)
