<a href="https://colab.research.google.com/github/UiinKim/Music_Generation/blob/main/Encodec_connected_by_MusicCaps(AudioCodec).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
# 필수 패키지 설치
!apt-get install -y sox libsndfile1 ffmpeg
!pip install torch torchvision torchaudio
!pip install datasets yt-dlp gradio
!pip install -U datasets[audio]
!pip install transformers
!pip install huggingface_hub

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libsndfile1 is already the newest version (1.0.31-2ubuntu0.1).
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
sox is already the newest version (14.4.2+git20190427-2+deb11u2ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


In [13]:
import subprocess
import os
from pathlib import Path
from datasets import load_dataset, Audio
import torchaudio
import torch
from transformers import EncodecModel, AutoProcessor

def download_clip(video_identifier, output_filename, start_time, end_time, tmp_dir='/tmp/musiccaps', num_attempts=5, url_base='https://www.youtube.com/watch?v='):
    status = False
    command = f"""
        yt-dlp --quiet --no-warnings -x --audio-format wav -f bestaudio -o "{output_filename}" --download-sections "*{start_time}-{end_time}" {url_base}{video_identifier}
    """.strip()
    attempts = 0
    while True:
        try:
            output = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError as err:
            attempts += 1
            if attempts == num_attempts:
                return status, err.output
        else:
            break
    status = os.path.exists(output_filename)
    return status, 'Downloaded'

def process(example):
    outfile_path = str(data_dir / f"{example['ytid']}.wav")
    status = True
    if not os.path.exists(outfile_path):
        status = False
        status, log = download_clip(
            example['ytid'],
            outfile_path,
            example['start_s'],
            example['end_s'],
        )
    example['audio'] = outfile_path
    example['download_status'] = status
    return example

samples_to_load = 10
cores = 4
sampling_rate = 44100
writer_batch_size = 100
data_dir = "./music_data"
data_dir = Path(data_dir)
data_dir.mkdir(exist_ok=True, parents=True)

ds = load_dataset('google/MusicCaps', split='train')
ds = ds.select(range(samples_to_load))
ds = ds.map(
        process,
        num_proc=cores,
        writer_batch_size=writer_batch_size,
        keep_in_memory=False
    ).cast_column('audio', Audio(sampling_rate=sampling_rate))


In [14]:
# 모델 및 프로세서 로드
model = EncodecModel.from_pretrained("facebook/encodec_48khz")
processor = AutoProcessor.from_pretrained("facebook/encodec_48khz")

# 데이터셋의 오디오 데이터를 올바른 샘플링 레이트로 캐스팅
ds = ds.cast_column("audio", Audio(sampling_rate=processor.sampling_rate))

# 각 오디오 샘플에 대해 인코딩 및 디코딩 수행
for i in range(len(ds)):
    audio_sample = ds[i]["audio"]["array"]

    # 입력 전처리: 모노에서 스테레오로 변환
    if len(audio_sample.shape) == 1:
        # 모노인 경우 스테레오로 변환
        audio_sample = torch.tensor(audio_sample).unsqueeze(0).repeat(2, 1).numpy()
    elif len(audio_sample.shape) == 2 and audio_sample.shape[0] == 1:
        # 모노 채널로 되어 있는 경우 스테레오로 변환
        audio_sample = audio_sample.repeat(2, axis=0)
    elif len(audio_sample.shape) == 2 and audio_sample.shape[0] == 2:
        # 이미 스테레오인 경우
        pass
    else:
        raise ValueError(f"Unexpected audio shape: {audio_sample.shape}")

    # 입력 전처리
    inputs = processor(raw_audio=audio_sample, sampling_rate=processor.sampling_rate, return_tensors="pt")

    # 오디오 입력을 명시적으로 인코딩 및 디코딩
    encoder_outputs = model.encode(inputs["input_values"], inputs["padding_mask"])
    audio_values = model.decode(encoder_outputs.audio_codes, encoder_outputs.audio_scales, inputs["padding_mask"])[0]

    # 또는 포워드 패스로 수행
    audio_values = model(inputs["input_values"], inputs["padding_mask"]).audio_values

    # 결과 오디오 값을 출력
    print(f"Processed audio sample {i+1}: {audio_values}")

TypeError: Audio.__init__() got an unexpected keyword argument 'sampling_rate'

In [None]:
from IPython.display import Audio, display