In [None]:
import os
import torch
import scipy
from datetime import datetime
from diffusers import AudioLDM2Pipeline
from dotenv import load_dotenv
from huggingface_hub import whoami

## 1. Audioldm2

In [None]:
load_dotenv(dotenv_path=os.path.join("..", ".env"))
token = os.getenv("HUGGING_FACE_HUB_TOKEN")

# 실제 인증 테스트
print("🔍 인증된 사용자 정보:")
whoami(token=token)

In [10]:
import scipy
import torch
from diffusers import AudioLDM2Pipeline

# load the pipeline
repo_id = "cvssp/audioldm2"
pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
pipe = pipe.to("cuda")

# define the prompts
prompt = "Soft thuds of moist soil being gently pressed as small seeds are planted, accompanied by subtle rustling of leaves and distant chirping of birds in a quiet garden."
negative_prompt = "Low quality."

# set the seed
generator = torch.Generator("cuda").manual_seed(0)

# run the generation
audio = pipe(
    prompt,
    negative_prompt=negative_prompt,
    num_inference_steps=200,
    audio_length_in_s=3.0,
    num_waveforms_per_prompt=3,
).audios

# save the best audio sample (index 0) as a .wav file
scipy.io.wavfile.write("seeds_test3.wav", rate=16000, data=audio[0])

Loading pipeline components...: 100%|██████████| 11/11 [00:02<00:00,  4.88it/s]
Expected types for language_model: (<class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'>,), got <class 'transformers.models.gpt2.modeling_gpt2.GPT2Model'>.
100%|██████████| 200/200 [04:37<00:00,  1.39s/it]


## 2. MusicGen

In [None]:
# !pip install -U git+https://github.com/facebookresearch/audiocraft#egg=audiocraft

In [None]:
import torchaudio
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write

# 모델 로드
model = MusicGen.get_pretrained('facebook/musicgen-large')
model.set_generation_params(duration=8)  # 8초 길이의 오디오 생성

# 프롬프트 설정
descriptions = ['lo-fi music with a soothing melody']

# 오디오 생성
wav = model.generate(descriptions)

# 오디오 저장
for idx, one_wav in enumerate(wav):
    audio_write(f'{idx}.wav', one_wav.cpu(), model.sample_rate, strategy="loudness")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
CLIPPING 0.wav happening with proba (a bit of clipping is okay): 0.0017812500009313226 maximum scale:  1.1532424688339233


In [16]:
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write

# 모델 로드 (사전 학습된 large 모델)
model = MusicGen.get_pretrained('facebook/musicgen-large')
model.set_generation_params(duration=8)  # 생성할 길이 (초)

# 🎯 프롬프트
descriptions = ["The ambient sound of a kitchen stew gently boiling, with soft bubbling and simmering, cozy atmosphere"]

# 오디오 생성
wav = model.generate(descriptions)

# 저장
audio_write("musicgen_stew_bubbling", wav[0].cpu(), model.sample_rate, strategy="loudness")
print("✅ 저장 완료: musicgen_stew_bubbling.wav")


  return torch.load(file, map_location=device)
  return torch.load(file, map_location=device)
  WeightNorm.apply(module, name, dim)


✅ 저장 완료: musicgen_stew_bubbling.wav


CLIPPING musicgen_stew_bubbling happening with proba (a bit of clipping is okay): 0.0062890625558793545 maximum scale:  1.4899888038635254


## 3. AudioGen

In [None]:
# import torchaudio
# from audiocraft.models import AudioGen
# from audiocraft.data.audio import audio_write

# # 모델 로드
# model = AudioGen.get_pretrained('facebook/audiogen-medium')
# model.set_generation_params(duration=5)  # 5초 길이의 오디오 생성

# # 프롬프트 설정
# descriptions = ['planting seeds', '']

# # 오디오 생성
# wav = model.generate(descriptions)

# # 오디오 저장
# for idx, one_wav in enumerate(wav):
#     audio_write(f'{idx}.wav', one_wav.cpu(), model.sample_rate, strategy="loudness")

In [14]:
# from audiocraft.models import AudioGen
# from audiocraft.data.audio import audio_write

# # 모델 로드
# model = AudioGen.get_pretrained("facebook/audiogen-medium")

# # 생성할 오디오 길이 설정
# model.set_generation_params(duration=4)

# # 프롬프트
# prompt = [
#     "Soft thuds of moist soil being gently pressed as small seeds are planted, accompanied by subtle rustling of leaves and distant chirping of birds in a quiet garden."
# ]

# # 오디오 생성
# wav_outputs = model.generate(prompt)

# # 저장
# audio_write("planting_seeds_test", wav_outputs[0].cpu(), model.sample_rate, strategy="loudness")
# print("✅ 오디오 생성 완료!")

from audiocraft.models import AudioGen
from audiocraft.data.audio import audio_write
import torch

# 모델 로드
model = AudioGen.get_pretrained("facebook/audiogen-medium")
model.set_generation_params(duration=3)

# 프롬프트
prompt = ["In-game cooking sound of a hot stew boiling and bubbling in a pot, with subtle steam hissing and soft pops, perfect for a cozy kitchen environment in a casual simulation game."]

# 오디오 생성 & 저장 (4개)
for i in range(4):
    wav = model.generate(prompt)[0].cpu()
    filename = f"boiling_pot_v{i+1}"
    audio_write(filename, wav, model.sample_rate, strategy="loudness")
    print(f"✅ 저장 완료: {filename}")

  return torch.load(file, map_location=device)
  WeightNorm.apply(module, name, dim)


✅ 저장 완료: boiling_pot_v1
✅ 저장 완료: boiling_pot_v2


CLIPPING boiling_pot_v3 happening with proba (a bit of clipping is okay): 0.2407708317041397 maximum scale:  4.6316046714782715


✅ 저장 완료: boiling_pot_v3
✅ 저장 완료: boiling_pot_v4


In [1]:
import torch
print("🔥 CUDA 가능?", torch.cuda.is_available())
print("🧠 GPU 사용 중:", torch.cuda.get_device_name(0))

🔥 CUDA 가능? True
🧠 GPU 사용 중: NVIDIA GeForce RTX 4070 Laptop GPU


## 4. DiffSound