In [None]:
import os
import torch
import scipy
from datetime import datetime
from diffusers import AudioLDM2Pipeline
from dotenv import load_dotenv

## 1. AudioLDM2

In [None]:
# Disabled cell: loads the Hugging Face token from ../.env and verifies it.
# NOTE(review): `whoami` is not imported in the notebook's import cell
# (presumably `huggingface_hub.whoami`) — add the import before re-enabling.
# load_dotenv(dotenv_path=os.path.join("..", ".env"))
# token = os.getenv("HUGGING_FACE_HUB_TOKEN")

# # Actual authentication test
# print("🔍 인증된 사용자 정보:")
# whoami(token=token)

In [None]:
import torch
from audiocraft.data.audio import audio_write
from audiocraft.models import AudioGen

# Load the pretrained AudioGen checkpoint and request 5-second clips.
model = AudioGen.get_pretrained("facebook/audiogen-medium")
model.set_generation_params(duration=5)

# One text prompt: a concrete, natural-sounding scene description that
# explicitly asks for no mechanical noise.
prompt = [
    "A person digging soil with a metal shovel, planting seeds, covering them gently with soil, "
    "natural outdoor environment, no mechanical noises, birds chirping softly far away."
]

# Generate four takes of the same prompt, saving each under its own stem.
for i in range(4):
    generated_batch = model.generate(prompt)
    wav = generated_batch[0].cpu()  # first (only) item of the batch, moved to CPU
    filename = "planting_seeds_v{}".format(i + 1)
    audio_write(filename, wav, model.sample_rate, strategy="loudness")
    print(f"✅ 저장 완료: {filename}")

In [None]:
import scipy
import scipy.io.wavfile  # explicit submodule import; do not rely on lazy attribute loading
import torch
from diffusers import AudioLDM2Pipeline

# Load the AudioLDM2 pipeline in half precision and move it to the GPU.
repo_id = "cvssp/audioldm2"
pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
pipe = pipe.to("cuda")

# Text prompt describing the target sound, plus a negative prompt to steer away from.
prompt = "Soft thuds of moist soil being gently pressed as small seeds are planted, accompanied by subtle rustling of leaves and distant chirping of birds in a quiet garden."
negative_prompt = "Low quality."

# Seeded generator so that repeated runs produce the same audio.
generator = torch.Generator("cuda").manual_seed(0)

# Run the generation. The generator must be passed to the pipeline explicitly;
# creating it alone does not seed the sampling.
audio = pipe(
    prompt,
    negative_prompt=negative_prompt,
    num_inference_steps=200,
    audio_length_in_s=3.0,
    num_waveforms_per_prompt=3,
    generator=generator,
).audios

# Save the first returned waveform (index 0) as a 16 kHz .wav file.
# NOTE(review): per the diffusers docs, candidates are ranked best-first when
# num_waveforms_per_prompt > 1 — confirm for the installed version.
SAMPLE_RATE = 16000
scipy.io.wavfile.write("seeds_test3.wav", rate=SAMPLE_RATE, data=audio[0])

## 2. MusicGen

In [None]:
# !pip install -U git+https://github.com/facebookresearch/audiocraft#egg=audiocraft

In [None]:
import torchaudio
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write

# Load the pretrained MusicGen checkpoint.
model = MusicGen.get_pretrained('facebook/musicgen-large')
model.set_generation_params(duration=8)  # generate 8-second clips

# Text prompts; one clip is generated per description.
descriptions = ['lo-fi music with a soothing melody']

# Generate the audio batch.
wav = model.generate(descriptions)

# Save each clip. audio_write appends the file extension itself, so only the
# stem is passed here (passing '0.wav' would produce '0.wav.wav').
for idx, one_wav in enumerate(wav):
    audio_write(f'{idx}', one_wav.cpu(), model.sample_rate, strategy="loudness")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
CLIPPING 0.wav happening with proba (a bit of clipping is okay): 0.0017812500009313226 maximum scale:  1.1532424688339233


In [None]:
from audiocraft.data.audio import audio_write
from audiocraft.models import MusicGen

# Load the pretrained large MusicGen checkpoint and request 8-second output.
model = MusicGen.get_pretrained('facebook/musicgen-large')
model.set_generation_params(duration=8)

# Single prompt describing the target ambience.
descriptions = [
    "The ambient sound of a kitchen stew gently boiling, with soft bubbling and simmering, cozy atmosphere"
]

# Generate the batch, then write its first clip to disk.
wav = model.generate(descriptions)
first_clip = wav[0].cpu()
audio_write("musicgen_stew_bubbling", first_clip, model.sample_rate, strategy="loudness")
print("✅ 저장 완료: musicgen_stew_bubbling.wav")


## 3. AudioGen

In [None]:
# Disabled cell: earlier AudioGen experiment, kept for reference only.
# import torchaudio
# from audiocraft.models import AudioGen
# from audiocraft.data.audio import audio_write

# # Load the model
# model = AudioGen.get_pretrained('facebook/audiogen-medium')
# model.set_generation_params(duration=5)  # generate 5-second audio

# # Set the prompts
# descriptions = ['planting seeds', '']

# # Generate audio
# wav = model.generate(descriptions)

# # Save audio
# for idx, one_wav in enumerate(wav):
#     audio_write(f'{idx}.wav', one_wav.cpu(), model.sample_rate, strategy="loudness")

In [6]:
# Disabled: earlier single-clip AudioGen attempt, superseded by the live code below.
# from audiocraft.models import AudioGen
# from audiocraft.data.audio import audio_write

# # Load the model
# model = AudioGen.get_pretrained("facebook/audiogen-medium")

# # Set the length of the generated audio
# model.set_generation_params(duration=4)

# # Prompt
# prompt = [
#     "Soft thuds of moist soil being gently pressed as small seeds are planted, accompanied by subtle rustling of leaves and distant chirping of birds in a quiet garden."
# ]

# # Generate audio
# wav_outputs = model.generate(prompt)

# # Save
# audio_write("planting_seeds_test", wav_outputs[0].cpu(), model.sample_rate, strategy="loudness")
# print("✅ 오디오 생성 완료!")

import torch
from audiocraft.data.audio import audio_write
from audiocraft.models import AudioGen

# Load the pretrained AudioGen checkpoint.
model = AudioGen.get_pretrained("facebook/audiogen-medium")

# Sampling settings, collected in one place so they are easy to tune.
generation_params = {
    "duration": 5,       # clip length in seconds
    "temperature": 0.7,  # sampling temperature, lowered from the default for steadier output
    "top_p": 0.9,        # nucleus-sampling cutoff
}
model.set_generation_params(**generation_params)

# Short text prompt.
prompt = ["digging with a shovel"]

# Generate four takes of the same prompt and save each one.
for i in range(4):
    generated_batch = model.generate(prompt)
    wav = generated_batch[0].cpu()  # first (only) item of the batch, moved to CPU
    filename = "farming_v{}".format(i + 1)
    audio_write(filename, wav, model.sample_rate, strategy="loudness")
    print(f"✅ 저장 완료: {filename}")




✅ 저장 완료: farming_v1
✅ 저장 완료: farming_v2


CLIPPING farming_v3 happening with proba (a bit of clipping is okay): 0.00039999998989515007 maximum scale:  1.8202755451202393


✅ 저장 완료: farming_v3
✅ 저장 완료: farming_v4


In [1]:
import torch

# Report whether PyTorch can see a CUDA device.
cuda_available = torch.cuda.is_available()
print("🔥 CUDA 가능?", cuda_available)

# get_device_name(0) raises when no CUDA device is present, so query it only
# when CUDA is available and print a placeholder otherwise.
gpu_name = torch.cuda.get_device_name(0) if cuda_available else None
print("🧠 GPU 사용 중:", gpu_name if gpu_name is not None else "없음 (CPU)")

🔥 CUDA 가능? True
🧠 GPU 사용 중: NVIDIA GeForce RTX 4070 Laptop GPU


## 8. EzAudio

In [4]:
# 설치
!git clone https://github.com/haidog-yaqub/EzAudio.git
%cd EzAudio
!pip install -r requirements.txt

# 예시 코드
from api.ezaudio import EzAudio
import torch
import soundfile as sf

device = 'cuda' if torch.cuda.is_available() else 'cpu'
ezaudio = EzAudio(model_name='s3_xl', device=device)

prompt = "A dog barking in the distance"
sr, audio = ezaudio.generate_audio(prompt)
sf.write(f'{prompt}.wav', audio, sr)


c:\Users\user\Documents\GitHub\ProjectISG-AI\Notebooks\JKL\soundctm\soundctm\EzAudio


Cloning into 'EzAudio'...


Collecting alias_free_torch (from -r requirements.txt (line 1))
  Using cached alias_free_torch-0.0.6-py3-none-any.whl.metadata (3.8 kB)
Collecting vector_quantize_pytorch (from -r requirements.txt (line 10))
  Using cached vector_quantize_pytorch-1.22.15-py3-none-any.whl.metadata (30 kB)
Collecting julius (from -r requirements.txt (line 12))
  Using cached julius-0.2.7.tar.gz (59 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting torch_stoi (from -r requirements.txt (line 13))
  Using cached torch_stoi-0.2.3-py3-none-any.whl.metadata (3.6 kB)
Collecting flatten-dict (from -r requirements.txt (line 14))
  Using cached flatten_dict-0.4.2-py2.py3-none-any.whl.metadata (9.2 kB)
Collecting tensorboard

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


ImportError: cannot import name 'cached_download' from 'huggingface_hub' (c:\Users\user\Documents\GitHub\sound_310\lib\site-packages\huggingface_hub\__init__.py)