In [None]:
!pwd

In [4]:
import os
import numpy as np
import torch
from pydub import AudioSegment
from tqdm import tqdm
import demucs.pretrained
import torchaudio

# Define paths and settings
raw_dataset_path = "/home/ubuntu/indie-rock-50"  # Path to raw dataset
save_path = "/home/ubuntu/indie-rock_instrumentals"  # Path to save instrumental audio

# Convert MP4 to WAV
def convert_mp4_to_wav(file_path):
    """Convert MP4 file to WAV format"""
    audio = AudioSegment.from_file(file_path)
    wav_filename = file_path.replace('.mp4', '.wav')
    audio.export(wav_filename, format="wav")
    return os.path.basename(wav_filename)

# Initialize Demucs separator for vocal removal
def initialize_separator():
    try:
        separator = demucs.pretrained.get_model('htdemucs')  # Better model for vocal separation
        if torch.cuda.is_available():
            separator = separator.to('cuda')
        return separator
    except Exception as e:
        print(f"Error loading Demucs model: {e}")
        return None

# Remove vocals and return instrumental audio
def remove_vocals(audio_tensor, separator, sample_rate):
    """Remove vocals from audio tensor and return instrumental mix"""
    try:
        from demucs.apply import apply_model
        from demucs.audio import convert_audio
        
        # Convert audio to separator's expected format
        audio_tensor = convert_audio(audio_tensor, sample_rate, 
                                   separator.samplerate, separator.audio_channels)
        
        # Apply separation
        with torch.no_grad():
            stems = apply_model(separator, audio_tensor[None], 
                              device='cuda' if torch.cuda.is_available() else 'cpu')
        
        # Get all stems except vocals
        vocal_idx = separator.sources.index('vocals')
        instrumental_stems = [stems[0, i] for i in range(len(separator.sources)) if i != vocal_idx]
        
        # Mix all non-vocal stems
        instrumental = torch.stack(instrumental_stems).sum(0)
        
        return instrumental, separator.samplerate
        
    except Exception as e:
        print(f"Error in vocal removal: {e}")
        return None, None

# Process audio files
def process_audio_files(raw_dataset_path, save_path, separator):
    if not os.path.exists(raw_dataset_path):
        print(f"Error: Raw dataset path {raw_dataset_path} does not exist")
        return
    
    for filename in tqdm(os.listdir(raw_dataset_path)):
        if filename.endswith(('.mp3', '.wav', '.flac', '.mp4')):
            file_path = os.path.join(raw_dataset_path, filename)
            
            try:
                # Handle MP4 conversion
                if filename.endswith('.mp4'):
                    fname = convert_mp4_to_wav(file_path)
                else:
                    fname = filename
                
                # Load audio
                audio_path = os.path.join(raw_dataset_path, fname)
                audio = AudioSegment.from_file(audio_path)
                
                # Convert to tensor for vocal removal
                if audio.channels == 1:
                    audio = audio.set_channels(2)
                
                samples = np.array(audio.get_array_of_samples(), dtype=np.float32)
                samples = samples.reshape((-1, 2)).T
                max_val = 2**(audio.sample_width * 8 - 1)
                samples = samples / max_val
                audio_tensor = torch.from_numpy(samples)
                
                # Remove vocals
                instrumental, out_sr = remove_vocals(audio_tensor, separator, audio.frame_rate)
                
                # Save instrumental track
                output_filename = os.path.join(save_path, f"{os.path.splitext(fname)[0]}_instrumental.wav")
                if instrumental is not None:
                    torchaudio.save(output_filename, instrumental.cpu(), out_sr)
                    print(f'Instrumental saved: {output_filename}')
                else:
                    print(f'Vocal removal failed for {fname}, skipping...')
                
            except Exception as e:
                print(f"Error processing {filename}: {e}")
                continue

# Main execution
os.makedirs(save_path, exist_ok=True)
separator = initialize_separator()

if separator is None:
    print("Error: Could not initialize vocal separator. Exiting.")
else:
    process_audio_files(raw_dataset_path, save_path, separator)
    print("Processing complete!")

  2%|▏         | 1/50 [00:06<05:00,  6.13s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/David Bowie - God Knows I'm Good - 2015 Remaster_instrumental.wav


  4%|▍         | 2/50 [00:09<03:30,  4.39s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Camper Van Beethoven - Take the Skinheads Bowling_instrumental.wav


  6%|▌         | 3/50 [00:16<04:21,  5.55s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Animal Collective - My Girls_instrumental.wav


  8%|▊         | 4/50 [00:22<04:31,  5.89s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Bon Iver - Holocene_instrumental.wav


 10%|█         | 5/50 [00:27<04:07,  5.50s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Brandtson - Words For You_instrumental.wav


 12%|█▏        | 6/50 [00:33<04:13,  5.77s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/A Thousand Details - Skulls and Gulls - Space Modular Remix_instrumental.wav


 14%|█▍        | 7/50 [00:38<03:50,  5.35s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Dashboard Confessional - Screaming Infidelities_instrumental.wav


 16%|█▌        | 8/50 [00:44<03:50,  5.49s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Dismemberment Plan - A Life of Possibilities_instrumental.wav


 18%|█▊        | 9/50 [00:47<03:19,  4.86s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/David Bowie - Rebel Rebel - Live; 2005 Mix; 2016 Remaster_instrumental.wav


 20%|██        | 10/50 [00:51<02:59,  4.50s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Belle and Sebastian - Sukie in the Graveyard_instrumental.wav


 22%|██▏       | 11/50 [00:55<02:58,  4.57s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Christie Front Drive - Radio_instrumental.wav


 24%|██▍       | 12/50 [00:59<02:47,  4.42s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/David Bowie - Suffragette City - Sounds Of The 70s- John Peel - recorded 16th May 1972_instrumental.wav


 26%|██▌       | 13/50 [01:06<03:03,  4.95s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Band of Horses - The Funeral - Live Acoustic_instrumental.wav


 28%|██▊       | 14/50 [01:09<02:39,  4.44s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Archers Of Loaf - Freezing Point - Vs. The Greatest of All Time_instrumental.wav


 30%|███       | 15/50 [01:14<02:42,  4.65s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Arctic Monkeys - 505_instrumental.wav


 32%|███▏      | 16/50 [01:18<02:32,  4.50s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/David Bowie - Ziggy Stardust - Sounds Of The 70s- Bob Harris - recorded 18th January 1972_instrumental.wav


 34%|███▍      | 17/50 [01:22<02:23,  4.35s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Aztec Camera - Oblivious_instrumental.wav


 36%|███▌      | 18/50 [01:28<02:34,  4.83s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Butthole Surfers - Pepper_instrumental.wav


 38%|███▊      | 19/50 [01:35<02:45,  5.32s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Arcade Fire - Sprawl II (Mountains Beyond Mountains)_instrumental.wav


 40%|████      | 20/50 [01:42<02:54,  5.80s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Arcade Fire - No Cars Go_instrumental.wav


 42%|████▏     | 21/50 [01:47<02:45,  5.69s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/American Football - Never Meant_instrumental.wav


 44%|████▍     | 22/50 [01:55<02:57,  6.32s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Battles - Atlas_instrumental.wav


 46%|████▌     | 23/50 [02:00<02:38,  5.86s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Braid - The New Nathan Detroits_instrumental.wav


 48%|████▊     | 24/50 [02:05<02:30,  5.80s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/David Bowie - Life On Mars - Live, Glastonbury, 2000_instrumental.wav


 50%|█████     | 25/50 [02:08<02:01,  4.85s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Archers Of Loaf - Web in Front_instrumental.wav


 52%|█████▏    | 26/50 [02:14<02:02,  5.10s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Don Caballero - In the Absence of Strong Evidence to the Contrary, One May Step Out of the Way of the Charging Bull_instrumental.wav


 54%|█████▍    | 27/50 [02:23<02:28,  6.45s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/David Bowie, Jeff Beck - Medley- The Jean Genie  Love Me Do  The Jean Genie (Live) [feat. Jeff Beck]_instrumental.wav


 56%|█████▌    | 28/50 [02:27<02:05,  5.72s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Dinosaur Jr. - Freak Scene - Live_instrumental.wav


 58%|█████▊    | 29/50 [02:35<02:10,  6.22s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Big Black - Kerosene_instrumental.wav


 60%|██████    | 30/50 [02:41<02:08,  6.44s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Broken Social Scene - Cause = Time_instrumental.wav


 62%|██████▏   | 31/50 [02:47<01:57,  6.20s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Cut Copy - Lights & Music_instrumental.wav


 64%|██████▍   | 32/50 [02:54<01:56,  6.49s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/A Thousand Details, Space Modular - Skulls and Gulls - Space Modular Remix_instrumental.wav


 66%|██████▌   | 33/50 [03:00<01:45,  6.22s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Black Rebel Motorcycle Club - Whatever Happened To My Rock 'N' Roll (Punk Song)_instrumental.wav


 68%|██████▊   | 34/50 [03:06<01:39,  6.22s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Arcade Fire - Rebellion (Lies)_instrumental.wav


 70%|███████   | 35/50 [03:09<01:17,  5.14s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Cap'n Jazz - Oh Messy Life_instrumental.wav


 72%|███████▏  | 36/50 [03:14<01:10,  5.04s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Arcade Fire - Month of May_instrumental.wav


 74%|███████▍  | 37/50 [03:17<01:00,  4.63s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Chavez - Break Up Your Band - 2020 Remaster_instrumental.wav


 76%|███████▌  | 38/50 [03:20<00:48,  4.03s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Belle and Sebastian - A Summer Wasting_instrumental.wav


 78%|███████▊  | 39/50 [03:26<00:52,  4.81s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Arcade Fire - Wake Up_instrumental.wav


 80%|████████  | 40/50 [03:33<00:52,  5.28s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Belle and Sebastian - The Boy With the Arab Strap_instrumental.wav


 82%|████████▏ | 41/50 [03:37<00:43,  4.81s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/David Bowie - Modern Love - Moonage Daydream Mix_instrumental.wav


 84%|████████▍ | 42/50 [03:43<00:43,  5.40s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Built To Spill - Carry the Zero_instrumental.wav


 86%|████████▌ | 43/50 [03:48<00:36,  5.19s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Death Cab for Cutie - Soul Meets Body_instrumental.wav


 88%|████████▊ | 44/50 [03:53<00:30,  5.08s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Bon Iver - Skinny Love_instrumental.wav


 90%|█████████ | 45/50 [03:58<00:25,  5.20s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Adorable - Homeboy_instrumental.wav


 92%|█████████▏| 46/50 [04:02<00:19,  4.76s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Art Brut - Formed a Band_instrumental.wav


 94%|█████████▍| 47/50 [04:05<00:12,  4.12s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Dirty Pretty Things - Bang Bang You're Dead_instrumental.wav


 96%|█████████▌| 48/50 [04:08<00:07,  3.96s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Arctic Monkeys - I Bet You Look Good On The Dancefloor_instrumental.wav


 98%|█████████▊| 49/50 [04:12<00:03,  3.97s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/Babyshambles - Killamangiro_instrumental.wav


100%|██████████| 50/50 [04:17<00:00,  5.14s/it]

Instrumental saved: /home/ubuntu/indie-rock_instrumentals/David Bowie - Fashion - Single Version; 2017 Remaster_instrumental.wav
Processing complete!





In [5]:
#@title **<font color='orange'>DEFINE FUNCTIONS AND STUFF</font>**
#@title Prepare Tags & Labels

ROOT_DIR = '/home/ubuntu'

AUDIOCRAFT_DIR = '/home/ubuntu/audiocraft'

%cd {AUDIOCRAFT_DIR}

DORA_DIR = f'{ROOT_DIR}/dora'

CONFIG_DIR = f'{AUDIOCRAFT_DIR}/egs/music'

CONFIG_DIR_TRAIN = f'{CONFIG_DIR}/train'

CONFIG_FILE_PATH_TRAIN = f'"{CONFIG_DIR_TRAIN}/data.jsonl"'

DATASET_DIR = f'{ROOT_DIR}/dataset'

DATASET_DIR_TRAIN = f'"{DATASET_DIR}/train"'

%mkdir -pv {DATASET_DIR_TRAIN}

%mkdir -pv {CONFIG_DIR_TRAIN}

%mkdir -pv {DORA_DIR}

# Derived from https://github.com/sakemin/cog-musicgen-fine-tuner/blob/main/train.py

import json
import logging
import os
import shutil
import subprocess
import sys
import tempfile
import time
import warnings

import librosa

import audiocraft.data.audio_dataset as audio_dataset
import numpy as np
import subprocess as sp

from datetime import datetime
from pathlib import Path
from pydub import AudioSegment
from tqdm import tqdm

#######################################################################################################################

#os.environ['CUDA_VISIBLE_DEVICES'] = '-1' #'0,1,2,3,4,5,6,7'

logging.basicConfig(
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
)
logging.getLogger("py4j").setLevel(logging.WARNING)
logging.getLogger("sh.command").setLevel(logging.ERROR)

#######################################################################################################################

def get_timestamp():
    current_time = datetime.now().strftime('%y%m%d%H%M%S')
    return current_time

def do_decompress(dataset_path, target_path):
    dataset_path = str(dataset_path)
    target_path = str(target_path)

    if str(dataset_path).rsplit('.', 1)[1] == 'zip':
        subprocess.run(['unzip', str(dataset_path), '-d', target_path + '/'])
    elif str(dataset_path).rsplit('.', 1)[1] == 'tar':
        subprocess.run(['tar', '-xvf', str(dataset_path), '-C', target_path + '/'])
    elif str(dataset_path).rsplit('.', 1)[1] == 'gz':
        subprocess.run(['tar', '-xvzf', str(dataset_path), '-C', target_path + '/'])
    elif str(dataset_path).rsplit('.', 1)[1] == 'tgz':
        subprocess.run(['tar', '-xzvf', str(dataset_path), '-C', target_path + '/'])

    elif str(dataset_path).rsplit('.', 1)[1] in ['wav', 'mp3', 'flac', 'mp4']:
        shutil.move(str(dataset_path), target_path + '/' + str(dataset_path.name))
    else:
        raise Exception("Not supported compression file type. The file type should be one of 'zip', 'tar', 'tar.gz', 'tgz' types of compression file, or a single 'wav', 'mp3', 'flac', 'mp4' types of audio file.")

    # Removing __MACOSX and .DS_Store
    if (Path(target_path)/"__MACOSX").is_dir():
        shutil.rmtree(target_path+"/__MACOSX")
    elif (Path(target_path)/"__MACOSX").is_file():
        os.remove(target_path+"/__MACOSX")
    if (Path(target_path)/".DS_Store").is_dir():
        shutil.rmtree(target_path+"/.DS_Store")
    elif (Path(target_path)/".DS_Store").is_file():
        os.remove(target_path+"/.DS_Store")

def convert_mp4_to_wav(mp4_path, wav_path):
    mp4_path = Path(mp4_path)
    wav_path = Path(wav_path)

    wav_path.mkdir(parents=True, exist_ok=True)

    if (mp4_path.suffix=='.mp4'):
        import moviepy
        video = moviepy.editor.VideoFileClip(mp4_path)
        wav_path = mp4_path.with_suffix('.wav')
        video.audio.write_audiofile(wav_path)
        print(f'mp4 file at {mp4_path} converted to wav file at {wav_path}')

def get_audio_features(audio_path, description=''):
    audio_features = {}
    audio_features['description'] = 'An indie-rock music piece with energetic guitar riffs, upbeat rhythms, and expressive melodies.'
    audio_features['tempo'] = ''
    audio_features['genre'] = 'indie rock'
    audio_features['key'] = ''
    audio_features['bpm'] = ''
    audio_features['moods'] = []
    audio_features['instrument'] = ''

    return audio_features

def prepare_data(
                dataset_path: str = 'dataset',
                meta_path: str = 'audiocraft/egs/music',
                dataset_splits = ['eval', 'valid', 'train'],
                anchor_term: str = None,
                device: str = 'cuda',
                channels: int = 2):

    dataset_path = Path(dataset_path)
    target_path  = Path(dataset_path)
    meta_path    = Path(meta_path)

    for split in dataset_splits:
        dataset_split_path = dataset_path / split
        meta_split_path = meta_path / split

        dataset_split_path.mkdir(parents=True, exist_ok=True)

        meta_split_path.mkdir(parents=True, exist_ok=True)

        meta_split_file_path = meta_split_path / 'data.jsonl'

        print(f'{dataset_split_path=}')
        audio_metas = audio_dataset.find_audio_files(
                                                        dataset_split_path,
                                                        audio_dataset.DEFAULT_EXTS,
                                                        progress=True,
                                                        resolve=True,
                                                        minimal=False,
                                                        workers=2
                                                        )

        print(f'Creating file-based metas ... ')
        if len(audio_metas)==0:
            raise ValueError("No audio file detected. Are you sure the audio file is longer than 5 seconds?")

        audio_dataset.save_audio_meta(meta_split_file_path, audio_metas)

        dataset_split_len = 0

        max_sample_rate = 0

        print()

        for audio_meta in tqdm(audio_metas, "Getting audio features"):

            audio_path = Path(audio_meta.path)

            if anchor_term is not None:
                description = anchor_term
            else:
                if audio_path.with_suffix('.txt').exists():
                    description = audio_path.with_suffix('.txt').read_text()

            audio_features_dict = get_audio_features(audio_path, description)

            audio_meta_dict = {
                "artist": '',
                "sample_rate": audio_meta.sample_rate,
                "file_extension": audio_path.suffix,
                "description": description,
                "keywords": "",
                "duration": audio_meta.duration,
                "title": audio_path.stem,
                "name": "",
                "path": str(audio_path),
                **audio_features_dict
            }

            print(f'{audio_meta_dict=}')

            audio_path.with_suffix('.json').write_text(json.dumps(audio_meta_dict))

            dataset_split_len += 1

    return dataset_split_len


/home/ubuntu/audiocraft
mkdir: created directory '/home/ubuntu/dataset'
mkdir: created directory '/home/ubuntu/dataset/train'
mkdir: created directory '/home/ubuntu/audiocraft/egs/music'
mkdir: created directory '/home/ubuntu/audiocraft/egs/music/train'


2025-07-31 10:32:45.054284: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-31 10:32:45.056936: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-31 10:32:45.111844: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
DATA_SRC_DIR = '/home/ubuntu/indie-rock_instrumentals' #@param {type: 'string'}

DATA_SRC_DIR = f'"{DATA_SRC_DIR}"'

USE_DIR = True #@param {type: 'boolean'}

if USE_DIR:
    # Patience required for large datasets. No clobber set in options
    %cp -ruv {DATA_SRC_DIR} {DATASET_DIR_TRAIN}
    print(f'ls {DATASET_DIR_TRAIN}')
    %ls {DATASET_DIR_TRAIN}

##################################################################


print(f'{ROOT_DIR=}')
print(f'{AUDIOCRAFT_DIR=}')

print(f'{DATA_SRC_DIR=}')

print(f'{DATASET_DIR=}')
print(f'{DATASET_DIR_TRAIN=}')

print(f'{CONFIG_DIR=}')
print(f'{CONFIG_DIR_TRAIN=}')
print(f'{CONFIG_FILE_PATH_TRAIN=}')

'/home/ubuntu/indie-rock_instrumentals' -> '/home/ubuntu/dataset/train/indie-rock_instrumentals'
"/home/ubuntu/indie-rock_instrumentals/David Bowie - God Knows I'm Good - 2015 Remaster_instrumental.wav" -> "/home/ubuntu/dataset/train/indie-rock_instrumentals/David Bowie - God Knows I'm Good - 2015 Remaster_instrumental.wav"
'/home/ubuntu/indie-rock_instrumentals/Camper Van Beethoven - Take the Skinheads Bowling_instrumental.wav' -> '/home/ubuntu/dataset/train/indie-rock_instrumentals/Camper Van Beethoven - Take the Skinheads Bowling_instrumental.wav'
'/home/ubuntu/indie-rock_instrumentals/Animal Collective - My Girls_instrumental.wav' -> '/home/ubuntu/dataset/train/indie-rock_instrumentals/Animal Collective - My Girls_instrumental.wav'
'/home/ubuntu/indie-rock_instrumentals/Bon Iver - Holocene_instrumental.wav' -> '/home/ubuntu/dataset/train/indie-rock_instrumentals/Bon Iver - Holocene_instrumental.wav'
'/home/ubuntu/indie-rock_instrumentals/Brandtson - Words For You_instrumental.wav' 

In [8]:
#@title **<font color='orange'>DO THE DATASET CREATION</font>**
#@markdown ---
#@markdown <font color='orange'>INSTRUCTIONS</font>

#@markdown This is where the `instructions` go

#@markdown ---

##################################################################
# Put this in a function or something later
# Write config file named finetune.yaml
# /content/audiocraft/config/dset/music

%cd {AUDIOCRAFT_DIR}

import os

CONFIG_DSET = f'{AUDIOCRAFT_DIR}/config/dset/music'

%mkdir -pv {CONFIG_DSET}

config_path = os.path.join(CONFIG_DSET, "finetune.yaml")

# We're fine-tuning. Not much point in using separate train/valid/eval dirs
package_str = "package"
yaml_contents = f"""#@{package_str} __global__
datasource:
  max_channels: 2
  max_sample_rate: 48000
  num_workers: 4

  evaluate: egs/music/train
  generate: egs/music/train
  valid: egs/music/train
  train: egs/music/train
"""

with open(config_path, 'w') as yaml_file:
    yaml_file.write(yaml_contents)

print(f'{config_path=}')
print(f'Config file created.')

##################################################################

%cd {AUDIOCRAFT_DIR}

ANCHOR_TERM = "" #@param {type: 'string'}

INPUT_STEREO = True #@param {type: 'boolean'}

channels = 2 if INPUT_STEREO else 1

len_dataset = prepare_data(
                            dataset_path=Path(DATASET_DIR),
                            meta_path=Path(CONFIG_DIR),
                            anchor_term=ANCHOR_TERM,
                            dataset_splits=['train'],
                            device='cuda',
                            channels=channels
                            )

print(f'{len_dataset=}')

/home/ubuntu/audiocraft
mkdir: created directory '/home/ubuntu/audiocraft/config/dset/music'
config_path='/home/ubuntu/audiocraft/config/dset/music/finetune.yaml'
Config file created.
/home/ubuntu/audiocraft
dataset_split_path=PosixPath('/home/ubuntu/dataset/train')
Finding audio files...
Getting audio metadata...


 100.0%0

Creating file-based metas ... 



Getting audio features: 100%|██████████| 50/50 [00:00<00:00, 1976.36it/s]

audio_meta_dict={'artist': '', 'sample_rate': 44100, 'file_extension': '.wav', 'description': 'An indie-rock music piece with energetic guitar riffs, upbeat rhythms, and expressive melodies.', 'keywords': '', 'duration': 362.17179138321995, 'title': 'A Thousand Details - Skulls and Gulls - Space Modular Remix_instrumental', 'name': '', 'path': '/home/ubuntu/dataset/train/indie-rock_instrumentals/A Thousand Details - Skulls and Gulls - Space Modular Remix_instrumental.wav', 'tempo': '', 'genre': 'indie rock', 'key': '', 'bpm': '', 'moods': [], 'instrument': ''}
audio_meta_dict={'artist': '', 'sample_rate': 44100, 'file_extension': '.wav', 'description': 'An indie-rock music piece with energetic guitar riffs, upbeat rhythms, and expressive melodies.', 'keywords': '', 'duration': 362.15074829931973, 'title': 'A Thousand Details, Space Modular - Skulls and Gulls - Space Modular Remix_instrumental', 'name': '', 'path': '/home/ubuntu/dataset/train/indie-rock_instrumentals/A Thousand Details,




In [1]:
#@title **<font color='orange'>SET THE CONFIGURATION</font>**

#@markdown ---

#@markdown <font color='orange'>INSTRUCTIONS</font>

#@markdown This is where the `instructions` go

AUDIOCRAFT_DIR = '/home/ubuntu/audiocraft'

%cd {AUDIOCRAFT_DIR}

%env AUDIOCRAFT_DORA_DIR=/home/ubuntu/dora

#%env AUDIOCRAFT_REFERENCE_DIR=???

ROOT_DIR = f'/home/ubuntu'

DORA_DIR = f'{ROOT_DIR}/dora'

AUDIOCRAFT_DIR = f'{ROOT_DIR}/audiocraft'

#DATASET_DIR = 'dataset/music/'  #@param {type: 'string'}

DATASET_DIR = f'{ROOT_DIR}/dataset'

DATASET_DIR_TRAIN = f'{DATASET_DIR}/train'

#CONFIG_DIR = f'{ROOT_DIR}/meta'
CONFIG_DIR = f'{AUDIOCRAFT_DIR}/egs/music'

CONFIG_DIR_TRAIN = f'{CONFIG_DIR}/train'

CONFIG_FILE_PATH_TRAIN = f'{CONFIG_DIR_TRAIN}/data.jsonl'

#@markdown ---
#@markdown #####<font color='orange'>SETTINGS</font>

#@markdown ---
EXECUTE_ONLY                        = 'null' #['generate','evaluate,'valid']

AUTOCAST                            = 'true' # Set true for Colab
AUTOCAST_DTYPE                      = 'bfloat16'  #@param {type:'string'} ['float16','float32','bfloat16']

CHANNELS                            = 2  #@param {type:'string'} [2,1]

CONDITIONER                         = 'text2music'  #@param {type:'string'} ['text2music', 'chroma2music']

CONTINUE_FROM                       = 'CUSTOM' #@param {type:'string'} ['//pretrained/facebook/musicgen-stereo-small', '//pretrained/facebook/musicgen-stereo-medium', '//pretrained/facebook/musicgen-small', '//pretrained/facebook/musicgen-medium', 'CUSTOM']
CONTINUE_FROM_CUSTOM                = '/home/ubuntu/dora/xps/Spin-trak-classical/checkpoint_5.th' #@param {type:'string'}
#CONTINUE_FROM                      = f'/content/drive/MyDrive/dora/xps/9566a935/checkpoint.th' #<- e.g. finetune chkpt
#CONTINUE_FROM                      = f'//SIG/{sig}'

DEVICE                              = 'cuda'  #@param {type:'string'} ['cuda','cpu']
DTYPE                               = 'float32'  #@param {type:'string'} ['float16','float32', 'bfloat16']
DSET                                = 'music/finetune' # This is some AC strangeness. Represents a piece of a Path

EFFICIENT_ATTENTION_BACKEND         = 'xformers'  #@param {type: 'string'} ['torch','xformers']
FSDP_USE                            = 'false'  # Not for Colab
LABEL                               = 'My_Label'
##@param {type:'string'}
MODEL_SCALE                         = 'large'  #@param {type:'string'} ['small','medium','large']
MP_START_METHOD                     = 'spawn'  #@param {type:'string'} ['fork','forkserver','spawn']
NUM_THREADS                         = 2  #@param {type:'number'}
SEED                                = 1123581  #@param {type:'number'}
SHOW_MODEL                          = 'false'  #@param {type:'string'} ['true','false']
SOLVER                              = 'musicgen/musicgen_base_32khz'  #@param {type:'string'} ['musicgen/musicgen_base_32khz', 'musicgen/musicgen_melody_32khz']
#@markdown ---
#@markdown #####<font color='orange'>CHECKPOINT</font>
#@markdown ---
CHECKPOINT_KEEP_LAST                = 5  #@param {type:'number'}
CHECKPOINT_SAVE_EVERY               = 1  #@param {type:'number'}
CHECKPOINT_SAVE_LAST                = 'false'  #@param {type:'string'} ['true','false']
#@markdown ---
#@markdown #####<font color='orange'>DATASET</font>
#@markdown ---
DATASET_BATCH_SIZE                  = 2  #@param {type:'number'}
DATASET_EVALUATE_NUM_SAMPLES        = 1  #@param {type:'number'}
DATASET_GENERATE_NUM_SAMPLES        = 6  #@param {type:'number'}
DATASET_GENERATE_RETURN_INFO        = 'true'  #@param {type:'string'} ['true','false']
DATASET_MIN_SEGMENT_RATIO           = 0.90  #@param {type:'number'}
DATASET_NUM_WORKERS                 = 0  #@param {type:'number'}
DATASET_SAMPLE_ON_DURATION          = 'true'  #@param {type:'string'} ['true','false']
DATASET_SAMPLE_ON_WEIGHT            = 'true'  #@param {type:'string'} ['true','false']
DATASET_SEGMENT_DURATION            = 30  #@param {type:'number'}
DATASET_SHUFFLE                     = 'true'  #@param {type:'string'} ['true','false']
DATASET_TRAIN_DROP_DESC_P           = 0.5  #@param {type:'number'}
DATASET_TRAIN_DROP_OTHER_P          = 1.0  #@param {type:'number'}
DATASET_TRAIN_MERGE_TEXT_P          = 0.0  #@param {type:'number'}
DATASET_TRAIN_NUM_SAMPLES           = 10_000_000  #@param {type:'number'}
DATASET_TRAIN_PERMUTATION_ON_FILES  = 'false'  #@param {type:'string'} ['true','false']
DATASET_TRAIN_SHUFFLE               = 'true'  #@param {type:'string'} ['true','false']
DATASET_TRAIN_SHUFFLE_SEED          = 112358  #@param {type:'number'}
DATASET_VALID_NUM_SAMPLES           = 6  #@param {type:'number'}
#@markdown ---
#@markdown #####<font color='orange'>STEREO</font>
#@markdown ---
INTERLEAVE_STEREO_CODEBOOKS_USE     = 'true'  #@param {type:'string'} ['true','false']
CODEBOOKS_PATTERN_DELAY_DELAYS      = '0, 0, 1, 1, 2, 2, 3, 3'  #@param {type:'string'} ['0, 0, 1, 1, 2, 2, 3, 3']
#@markdown ---
#@markdown #####<font color='orange'>LOGGING</font>
#@markdown ---
LOGGING_LEVEL                       = 'INFO'  #@param {type:'string'} ['INFO','ERROR']
LOGGING_LOG_UPDATES                 = 10  #@param {type:'number'}
LOGGING_LOG_TENSORBOARD             = 'false'  #@param {type:'string'} ['true','false']
LOGGING_LOG_WANDB                   = 'false'  #@param {type:'string'} ['true','false']
#@markdown ---
#@markdown #####<font color='orange'>EVALUATE SAMPLES</font>
#@markdown ---
EVALUATE_EVERY                      = 100  #@param {type:'number'}
#@markdown ---
#@markdown #####<font color='orange'>GENERATE SAMPLES</font>
#@markdown ---
GENERATE_EVERY                      = 1  #@param {type:'number'}
#@markdown ---
#@markdown #####<font color='orange'>OPTIMIZER</font>
#@markdown ---
OPTIM_ADAM_WEIGHT_DECAY             = 1e-3  #@param {type:'number'}
OPTIM_EPOCHS                        = 5  #@param {type:'number'}
OPTIM_LR                            = 1e-6  #@param {type:'number'}
OPTIM_UPDATES_PER_EPOCH             = 1000  #@param {type:'number'}
OPTIM_OPTIMIZER                     = 'adamw'  #@param {type:'string'} ['adamw','dadam']
#@markdown ---
#@markdown #####<font color='orange'>SCHEDULE</font>
#@markdown ---
SCHEDULE_COSINE_WARMUP              = 0  #@param {type:'number'}
SCHEDULE_LR_SCHEDULER               = 'cosine'  #@param {type:'string'} ['cosine']
#@markdown ---
#@markdown #####<font color='orange'>TENSORBOARD</font>
#@markdown ---
TENSORBOARD_WITH_MEDIA_LOGGING      = 'false'  #@param {type:'string'} ['true', 'false']
TENSORBOARD_NAME                    = 'AudioCraftResearch'  #@param {type:'string'}
TENSORBOARD_SUB_DIR                 = '/home/ubuntu/tb'  #@param {type:'string'}
#@markdown ---
#@markdown #####<font color='orange'>TRANSFORMER</font>
#@markdown ---
TRANSFORMER_LM_N_Q                  = 8  #@param {type:'number'}
TRANSFORMER_LM_CARD                 = 2048  #@param {type:'number'}
#@markdown ---
#@markdown #####<font color='orange'>WANDB</font>
#@markdown ---
WANDB_WITH_MEDIA_LOGGING            = 'false'  #@param {type:'string'} ['true', 'false']
WANDB_GROUP                         = 'AudioCraftWerk'  #@param {type:'string'}
WANDB_NAME                          = 'FineTunes'  #@param {type:'string'}
WANDB_PROJECT                       = 'AudioCraftFineTunes'  #@param {type:'string'}
#@markdown ---

if CONTINUE_FROM == 'CUSTOM':
    CONTINUE_FROM = CONTINUE_FROM_CUSTOM

print()
print(f'---   SETTINGS   ---')
print()
print()
print(f'{ROOT_DIR=}')
print()
print(f'{DATASET_DIR=}')
print(f'{DATASET_DIR_TRAIN=}')
print()
print(f'{CONFIG_DIR=}')
print(f'{CONFIG_DIR_TRAIN=}')
print(f'{CONFIG_FILE_PATH_TRAIN=}')
print()
print(f'{EXECUTE_ONLY=}')
print()
print(f'{AUTOCAST=}')
print(f'{AUTOCAST_DTYPE=}')
print()
print(f'{CHANNELS=}')
print()
print(f'{CODEBOOKS_PATTERN_DELAY_DELAYS=}')
print()
print(f'{CONDITIONER=}')
print()
print(f'{CONTINUE_FROM=}')
print()
print(f'{DEVICE=}')
print(f'{DTYPE=}')
print(f'{DSET=}')
print(f'{EXECUTE_ONLY=}')
print(f'{EFFICIENT_ATTENTION_BACKEND=}')
print(f'{FSDP_USE=}')
print(f'{LABEL=}')
print(f'{MODEL_SCALE=}')
print(f'{NUM_THREADS=}')
print(f'{SEED=}')
print(f'{SHOW_MODEL=}')
print()
print(f'{SOLVER=}')
print()
print(f'{CHECKPOINT_KEEP_LAST=}')
print(f'{CHECKPOINT_SAVE_EVERY=}')
print(f'{CHECKPOINT_SAVE_LAST=}')
print()
print(f'{DATASET_BATCH_SIZE=}')
print(f'{DATASET_EVALUATE_NUM_SAMPLES=}')
print(f'{DATASET_GENERATE_NUM_SAMPLES=}')
print(f'{DATASET_GENERATE_RETURN_INFO=}')
print(f'{DATASET_MIN_SEGMENT_RATIO=}')
print(f'{DATASET_NUM_WORKERS=}')
print(f'{DATASET_SAMPLE_ON_DURATION=}')
print(f'{DATASET_SAMPLE_ON_WEIGHT=}')
print(f'{DATASET_SEGMENT_DURATION=}')
print(f'{DATASET_SHUFFLE=}')
print(f'{DATASET_TRAIN_DROP_DESC_P=}')
print(f'{DATASET_TRAIN_DROP_OTHER_P=}')
print(f'{DATASET_TRAIN_MERGE_TEXT_P=}')
print(f'{DATASET_TRAIN_NUM_SAMPLES=}')
print(f'{DATASET_TRAIN_PERMUTATION_ON_FILES=}')
print(f'{DATASET_TRAIN_SHUFFLE=}')
print(f'{DATASET_TRAIN_SHUFFLE_SEED=}')
print(f'{DATASET_VALID_NUM_SAMPLES=}')
print()
print(f'{EVALUATE_EVERY=}')
print()
print(f'{GENERATE_EVERY=}')
print()
print(f'{INTERLEAVE_STEREO_CODEBOOKS_USE=}')
print()
print(f'{LOGGING_LEVEL=}')
print(f'{LOGGING_LOG_UPDATES=}')
print(f'{LOGGING_LOG_TENSORBOARD=}')
print(f'{LOGGING_LOG_WANDB=}')
print()
print(f'{OPTIM_ADAM_WEIGHT_DECAY=}')
print(f'{OPTIM_EPOCHS=}')
print(f'{OPTIM_LR=}')
print(f'{OPTIM_UPDATES_PER_EPOCH=}')
print(f'{OPTIM_OPTIMIZER=}')
print()
print(f'{SCHEDULE_COSINE_WARMUP=}')
print(f'{SCHEDULE_LR_SCHEDULER=}')
print()
print(f'{TENSORBOARD_WITH_MEDIA_LOGGING=}')
print(f'{TENSORBOARD_NAME=}')
print(f'{TENSORBOARD_SUB_DIR=}')
print()
print(f'{TRANSFORMER_LM_N_Q=}')
print(f'{TRANSFORMER_LM_CARD=}')
print()
print(f'{WANDB_WITH_MEDIA_LOGGING=}')
print(f'{WANDB_GROUP=}')
print(f'{WANDB_NAME=}')
print(f'{WANDB_PROJECT=}')
print()
print(f'---   END SETTINGS   ---')

/home/ubuntu/audiocraft
env: AUDIOCRAFT_DORA_DIR=/home/ubuntu/dora

---   SETTINGS   ---


ROOT_DIR='/home/ubuntu'

DATASET_DIR='/home/ubuntu/dataset'
DATASET_DIR_TRAIN='/home/ubuntu/dataset/train'

CONFIG_DIR='/home/ubuntu/audiocraft/egs/music'
CONFIG_DIR_TRAIN='/home/ubuntu/audiocraft/egs/music/train'
CONFIG_FILE_PATH_TRAIN='/home/ubuntu/audiocraft/egs/music/train/data.jsonl'

EXECUTE_ONLY='null'

AUTOCAST='true'
AUTOCAST_DTYPE='bfloat16'

CHANNELS=2

CODEBOOKS_PATTERN_DELAY_DELAYS='0, 0, 1, 1, 2, 2, 3, 3'

CONDITIONER='text2music'

CONTINUE_FROM='/home/ubuntu/dora/xps/Spin-trak-classical/checkpoint_5.th'

DEVICE='cuda'
DTYPE='float32'
DSET='music/finetune'
EXECUTE_ONLY='null'
EFFICIENT_ATTENTION_BACKEND='xformers'
FSDP_USE='false'
LABEL='My_Label'
MODEL_SCALE='large'
NUM_THREADS=2
SEED=1123581
SHOW_MODEL='false'

SOLVER='musicgen/musicgen_base_32khz'

CHECKPOINT_KEEP_LAST=5
CHECKPOINT_SAVE_EVERY=1
CHECKPOINT_SAVE_LAST='false'

DATASET_BATCH_SIZE=2
DATASET_EVALUATE_NUM_SAMPLES=1
DATA

In [2]:
#@title **<font color='orange'>DO THE TRAINING</font>**
#@markdown ---
#@markdown <font color='orange'>INSTRUCTIONS</font>

#@markdown This is where the `instructions` go

#@markdown ---

%cd {AUDIOCRAFT_DIR}

BENCHMARK_NO_LOAD = 'false'

command = (
    f"dora -P audiocraft run"
    f" fsdp.use=false"
    f" autocast={AUTOCAST}"
    f" autocast_dtype={AUTOCAST_DTYPE}"
    f" benchmark_no_load={BENCHMARK_NO_LOAD}"
    f" solver={SOLVER}"
    f" conditioner={CONDITIONER}"
    f" continue_from={CONTINUE_FROM}"
    f" channels={CHANNELS}"
    f" device={DEVICE}"
    f" dset={DSET}"
    f" dtype={DTYPE}"
    f" efficient_attention_backend={EFFICIENT_ATTENTION_BACKEND}"
    f" label={LABEL}"
    f" model/lm/model_scale={MODEL_SCALE}"
    f" num_threads={NUM_THREADS}"
    f" seed={SEED}"
    f" show={SHOW_MODEL}"
    f" codebooks_pattern.delay.delays='[0, 0, 1, 1, 2, 2, 3, 3]'"
    f" interleave_stereo_codebooks.use={INTERLEAVE_STEREO_CODEBOOKS_USE}"
    f" transformer_lm.n_q={TRANSFORMER_LM_N_Q}"
    f" transformer_lm.card={TRANSFORMER_LM_CARD}"
    f" checkpoint.keep_last={CHECKPOINT_KEEP_LAST}"
    f" checkpoint.save_every={CHECKPOINT_SAVE_EVERY}"
    f" checkpoint.save_last={CHECKPOINT_SAVE_LAST}"
    f" dataset.batch_size={DATASET_BATCH_SIZE}"
    f" dataset.evaluate.num_samples={DATASET_EVALUATE_NUM_SAMPLES}"
    f" dataset.generate.num_samples={DATASET_GENERATE_NUM_SAMPLES}"
    f" dataset.generate.return_info={DATASET_GENERATE_RETURN_INFO}"
    f" dataset.min_segment_ratio={DATASET_MIN_SEGMENT_RATIO}"
    f" dataset.num_workers={DATASET_NUM_WORKERS}"
    f" dataset.sample_on_duration={DATASET_SAMPLE_ON_DURATION}"
    f" dataset.sample_on_weight={DATASET_SAMPLE_ON_WEIGHT}"
    f" dataset.segment_duration={DATASET_SEGMENT_DURATION}"
    f" dataset.shuffle={DATASET_SHUFFLE}"
    f" dataset.train.drop_desc_p={DATASET_TRAIN_DROP_DESC_P}"
    f" dataset.train.drop_other_p={DATASET_TRAIN_DROP_OTHER_P}"
    f" dataset.train.merge_text_p={DATASET_TRAIN_MERGE_TEXT_P}"
    f" dataset.train.num_samples={DATASET_TRAIN_NUM_SAMPLES}"
    f" dataset.train.permutation_on_files={DATASET_TRAIN_PERMUTATION_ON_FILES}"
    f" dataset.train.shuffle={DATASET_TRAIN_SHUFFLE}"
    f" dataset.train.shuffle_seed={DATASET_TRAIN_SHUFFLE_SEED}"
    f" dataset.valid.num_samples={DATASET_VALID_NUM_SAMPLES}"

    f" evaluate.every={EVALUATE_EVERY}"

    f" generate.every={GENERATE_EVERY}"

    f" generate.lm.prompted_samples=true"
    f" generate.lm.unprompted_samples=true"
    f" generate.lm.gen_gt_samples=false"

    f" logging.level={LOGGING_LEVEL}"
    f" logging.log_updates={LOGGING_LOG_UPDATES}"
    f" logging.log_tensorboard={LOGGING_LOG_TENSORBOARD}"
    f" logging.log_wandb={LOGGING_LOG_WANDB}"

    f" optim.optimizer={OPTIM_OPTIMIZER}"
    f" optim.lr={OPTIM_LR}"
    f" optim.epochs={OPTIM_EPOCHS}"
    f" optim.updates_per_epoch={OPTIM_UPDATES_PER_EPOCH}"
    f" optim.adam.weight_decay={OPTIM_ADAM_WEIGHT_DECAY}"

    f" schedule.cosine.warmup={SCHEDULE_COSINE_WARMUP}"
    f" schedule.lr_scheduler={SCHEDULE_LR_SCHEDULER}"

    f" tensorboard.name={TENSORBOARD_NAME}"
    f" tensorboard.sub_dir={TENSORBOARD_SUB_DIR}"
    f" tensorboard.with_media_logging={TENSORBOARD_WITH_MEDIA_LOGGING}"

    f" wandb.group={WANDB_GROUP}"
    f" wandb.name={WANDB_NAME}"
    f" wandb.project={WANDB_PROJECT}"
    f" wandb.with_media_logging={WANDB_WITH_MEDIA_LOGGING}"
    )

print(f'{command}')
!{command}


/home/ubuntu/audiocraft
dora -P audiocraft run fsdp.use=false autocast=true autocast_dtype=bfloat16 benchmark_no_load=false solver=musicgen/musicgen_base_32khz conditioner=text2music continue_from=/home/ubuntu/dora/xps/Spin-trak-classical/checkpoint_5.th channels=2 device=cuda dset=music/finetune dtype=float32 efficient_attention_backend=xformers label=My_Label model/lm/model_scale=large num_threads=2 seed=1123581 show=false codebooks_pattern.delay.delays='[0, 0, 1, 1, 2, 2, 3, 3]' interleave_stereo_codebooks.use=true transformer_lm.n_q=8 transformer_lm.card=2048 checkpoint.keep_last=5 checkpoint.save_every=1 checkpoint.save_last=false dataset.batch_size=2 dataset.evaluate.num_samples=1 dataset.generate.num_samples=6 dataset.generate.return_info=true dataset.min_segment_ratio=0.9 dataset.num_workers=0 dataset.sample_on_duration=true dataset.sample_on_weight=true dataset.segment_duration=30 dataset.shuffle=true dataset.train.drop_desc_p=0.5 dataset.train.drop_other_p=1.0 dataset.train.m

In [8]:
#!/usr/bin/env python3

import os
import shutil
from pathlib import Path

# Configuration
ROOT_DIR = '/home/ubuntu'
AUDIOCRAFT_DIR = f'{ROOT_DIR}/audiocraft'  # Adjust if audiocraft is in a different location

# Change to audiocraft directory
os.chdir(AUDIOCRAFT_DIR)

from audiocraft.utils import export
from audiocraft import train
import audiocraft

# Configuration parameters
DO_EXPORT = True
MUSICGEN_FINETUNES_DIR = '/homme/ubuntu/musicgen_finetunes'
SIG = 'Spin-trak-classical'  # Your model signature
EPOCH = 5
SIG_EPOCH = f'{SIG}_{EPOCH}'

# Directory paths
SIG_LOCAL_DIR = f'{ROOT_DIR}/{MUSICGEN_FINETUNES_DIR}/{SIG_EPOCH}'
CHECKPOINT_PATH = f'{ROOT_DIR}/dora/xps/{SIG}/checkpoint_{EPOCH}.th'

print(f'SIG_LOCAL_DIR = {SIG_LOCAL_DIR}')
print(f'CHECKPOINT_PATH = {CHECKPOINT_PATH}')

# Create output directory
os.makedirs(SIG_LOCAL_DIR, exist_ok=True)
print(f'Created directory: {SIG_LOCAL_DIR}')

if DO_EXPORT:
    # Check if checkpoint file exists
    if not os.path.exists(CHECKPOINT_PATH):
        print(f'ERROR: Checkpoint file not found at {CHECKPOINT_PATH}')
        exit(1)
    
    print(f'Exporting language model from {CHECKPOINT_PATH}...')
    try:
        export.export_lm(CHECKPOINT_PATH, f'{SIG_LOCAL_DIR}/state_dict.bin')
        print(f'✓ Language model exported successfully')
    except Exception as e:
        print(f'ERROR exporting language model: {e}')
        exit(1)

    print(f'Exporting encodec compression model...')
    try:
        export.export_pretrained_compression_model('facebook/encodec_32khz', f'{SIG_LOCAL_DIR}/compression_state_dict.bin')
        print(f'✓ Compression model exported successfully')
    except Exception as e:
        print(f'ERROR exporting compression model: {e}')
        exit(1)

    print(f'Export completed successfully!')
    print(f'Exported files location: {SIG_LOCAL_DIR}')
    print(f'Files created:')
    for file in os.listdir(SIG_LOCAL_DIR):
        file_path = os.path.join(SIG_LOCAL_DIR, file)
        size = os.path.getsize(file_path) / (1024*1024)  # Size in MB
        print(f'  - {file} ({size:.1f} MB)')

else:
    print(f'Export skipped (DO_EXPORT = False)')

print(f'Done.')

Dora directory: /tmp/audiocraft_ubuntu


SIG_LOCAL_DIR = /home/ubuntu//homme/ubuntu/musicgen_finetunes/Spin-trak-classical_5
CHECKPOINT_PATH = /home/ubuntu/dora/xps/Spin-trak-classical/checkpoint_5.th
Created directory: /home/ubuntu//homme/ubuntu/musicgen_finetunes/Spin-trak-classical_5
Exporting language model from /home/ubuntu/dora/xps/Spin-trak-classical/checkpoint_5.th...


KeyboardInterrupt: 

In [22]:
#!/usr/bin/env python3

import os
from pathlib import Path

# Configuration
ROOT_DIR = '/home/ubuntu'
SIG = 'Spin-trak-indie-rock'  # Your model signature

# Loading a finetune for inference:
from audiocraft.models import MusicGen

# Path to your exported model
MODEL_DESCRIPTOR = f'/home/ubuntu/homme/ubuntu/musicgen_finetunes/{SIG}'

# Alternative model options (commented out):
# MODEL_DESCRIPTOR = 'facebook/musicgen-stereo-small'
# MODEL_DESCRIPTOR = 'facebook/musicgen-stereo-medium' 
# MODEL_DESCRIPTOR = 'facebook/musicgen-stereo-large'

print(f'Loading model: {MODEL_DESCRIPTOR}')

# Check if the model directory exists
if not os.path.exists(MODEL_DESCRIPTOR):
    print(f'ERROR: Model directory not found at {MODEL_DESCRIPTOR}')
    print(f'Please make sure the export was successful and the path is correct.')
    exit(1)

# Check if required files exist
required_files = ['state_dict.bin', 'compression_state_dict.bin']
for file in required_files:
    file_path = os.path.join(MODEL_DESCRIPTOR, file)
    if not os.path.exists(file_path):
        print(f'ERROR: Required file not found: {file_path}')
        exit(1)
    else:
        size = os.path.getsize(file_path) / (1024*1024)  # Size in MB
        print(f'✓ Found {file} ({size:.1f} MB)')

print(f'Loading MusicGen model from {MODEL_DESCRIPTOR}...')

try:
    # Load the model (will use GPU if available)
    musicgen = MusicGen.get_pretrained(MODEL_DESCRIPTOR, device='cuda')
    print(f'✓ Model loaded successfully!')
    
    # Check model device properly
    try:
        if hasattr(musicgen, 'compression_model'):
            device = next(musicgen.compression_model.parameters()).device
            print(f'Model device: {device}')
        else:
            print(f'Model loaded on specified device: cuda')
    except:
        print(f'Model loaded on specified device: cuda')
    
    print(f'Model ready for inference.')
    
except Exception as e:
    print(f'ERROR loading model: {e}')
    print(f'Trying CPU fallback...')
    try:
        musicgen = MusicGen.get_pretrained(MODEL_DESCRIPTOR, device='cpu')
        print(f'✓ Model loaded on CPU successfully!')
    except Exception as e2:
        print(f'ERROR loading model on CPU: {e2}')
        exit(1)

print(f'Done.')

Loading model: /home/ubuntu/homme/ubuntu/musicgen_finetunes/Spin-trak-indie-rock
✓ Found state_dict.bin (6276.4 MB)
✓ Found compression_state_dict.bin (0.0 MB)
Loading MusicGen model from /home/ubuntu/homme/ubuntu/musicgen_finetunes/Spin-trak-indie-rock...
✓ Model loaded successfully!
Model device: cuda:0
Model ready for inference.
Done.


In [23]:
#!/usr/bin/env python3

import os
import torch
import torchaudio
import random
import numpy as np
from datetime import datetime
from pathlib import Path

# Assuming your musicgen model is already loaded
# musicgen = MusicGen.get_pretrained(MODEL_DESCRIPTOR, device='cuda')

# Configuration parameters
use_sampling = True # hardcode this
sample_rate = 32000
top_k = 250
top_p = 0
temperature = 1.0
cfg_coef = 7.0
duration = 30  # 30 seconds only
two_step_cfg = False

strategy = "loudness"  # ["loudness", "peak"]
loudness_compressor = True

seed = 15

# Generation settings
descriptions = "Lo-fi indie-rock jam with raw guitar riffs, steady drum beat, and a relaxed vibe, recorded with vintage-style production."  # Your prompt
num_tracks = 1  # Only one track
descriptions = [descriptions] * num_tracks

extend_stride=15

do_unconditional = False  # Only conditional
do_conditional = True
do_continue = False  # Disabled
do_bip_bip = False  # Disabled

# Helper functions
def get_timestamp():
    return datetime.now().strftime("%Y%m%d_%H%M%S")

def set_all_seeds(seed):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

def save_audio(audio_tensor, sample_rate, filename):
    """Save audio tensor to file"""
    output_dir = "generated_audio"
    os.makedirs(output_dir, exist_ok=True)
    
    filepath = os.path.join(output_dir, f"{filename}.wav")
    
    # Ensure audio is in the right format (channels, samples)
    if audio_tensor.dim() == 3:  # (batch, channels, samples)
        for i, audio in enumerate(audio_tensor):
            file_path = os.path.join(output_dir, f"{filename}_{i}.wav")
            torchaudio.save(file_path, audio.cpu(), sample_rate)
            print(f"Saved: {file_path}")
    else:  # (channels, samples)
        torchaudio.save(filepath, audio_tensor.cpu(), sample_rate)
        print(f"Saved: {filepath}")

def get_bip_bip(bip_duration=0.250, frequency=220, duration=1.0, sample_rate=32000):
    """Generate a simple bip bip audio prompt"""
    total_samples = int(duration * sample_rate)
    bip_samples = int(bip_duration * sample_rate)
    
    # Create silence
    audio = torch.zeros(1, total_samples)
    
    # Add two bips
    t = torch.linspace(0, bip_duration, bip_samples)
    bip = torch.sin(2 * torch.pi * frequency * t) * 0.5
    
    # First bip at 0.1s, second bip at 0.5s
    start1 = int(0.1 * sample_rate)
    start2 = int(0.5 * sample_rate)
    
    if start1 + bip_samples <= total_samples:
        audio[0, start1:start1 + bip_samples] = bip
    if start2 + bip_samples <= total_samples:
        audio[0, start2:start2 + bip_samples] = bip
    
    return audio

# Print configuration
print(f'{use_sampling=}')
print(f'{top_k=}')
print(f'{top_p=}')
print(f'{temperature=}')
print(f'{cfg_coef=}')
print(f'{duration=}')
print(f'{strategy=}')
print(f'{loudness_compressor=}')
print(f'{seed=}')
print(f'{num_tracks=}')
print(f'{descriptions=}')
print(f'{two_step_cfg=}')
print(f'{do_unconditional=}')
print(f'{do_conditional=}')
print(f'{do_continue=}')
print(f'{do_bip_bip=}')
print()

# Set random seed
seed = random.randint(0, 2**31 - 1)
set_all_seeds(seed)
print(f'Using random seed: {seed}')

# Configure the model
musicgen.set_generation_params(
    use_sampling=use_sampling,
    top_k=top_k,
    top_p=top_p,
    temperature=temperature,
    cfg_coef=cfg_coef,
    duration=duration,
    extend_stride=extend_stride,
    two_step_cfg=two_step_cfg,
)

print(f'num_tracks: {num_tracks}')
print(f'Generating ...')

label = get_timestamp()

# Only generate conditional sample
print(f'Generating conditional sample (30 seconds)...')
try:
    audios = musicgen.generate(descriptions, progress=True)
    save_audio(audios, sample_rate, f'{label}_cond')
    print(f'✓ Conditional generation completed - 30 second track generated!')
except Exception as e:
    print(f'Error in conditional generation: {e}')

print(f'Done! Check the "generated_audio" folder for your 30-second track.')
print(f'Generation completed at: {datetime.now()}')


use_sampling=True
top_k=250
top_p=0
temperature=1.0
cfg_coef=7.0
duration=30
strategy='loudness'
loudness_compressor=True
seed=15
num_tracks=1
descriptions=['Lo-fi indie-rock jam with raw guitar riffs, steady drum beat, and a relaxed vibe, recorded with vintage-style production.']
two_step_cfg=False
do_unconditional=False
do_conditional=True
do_continue=False
do_bip_bip=False

Using random seed: 334466799
num_tracks: 1
Generating ...
Generating conditional sample (30 seconds)...
Saved: generated_audio/20250801_073149_cond_0.wav
✓ Conditional generation completed - 30 second track generated!
Done! Check the "generated_audio" folder for your 30-second track.
Generation completed at: 2025-08-01 07:32:34.702755
