# **Install AudioCraft**

In [None]:
# Fetch the AudioCraft sources and switch into the checkout so that the
# editable install below runs from inside the repository.
!git clone https://github.com/facebookresearch/audiocraft.git
%cd audiocraft
# Install the package in editable mode from the current directory.
!pip install -e .
# Extra packages used later in this notebook: the `dora` command launched by
# the training cell and `numba`, imported by the metadata cell.
!pip install dora-search
!pip install numba

# **Mount Google Drive for Audio Files**


In [None]:

# Mount Google Drive so the dataset stored there is reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

# Folder with the raw audio files. The "chunking with vocals" cell reads from
# it and writes into "<dataset_path>/chunks"; the metadata cell later
# reassigns this name to that chunks folder.
dataset_path = "/content/drive/MyDrive/raw"

# **Chunking audio files and removing vocals**


In [None]:
import os
import numpy as np
import torch
from pydub import AudioSegment
from tqdm import tqdm
import demucs.pretrained
import torchaudio
import moviepy.editor

# User-tunable settings for this cell
drop_vocals = True  # True -> strip vocals with Demucs; False -> keep chunks unchanged
target_path = "/path/to/your/audio/files"  # Folder containing the audio files to chunk

# Load the pretrained 'mdx_extra' Demucs separator onto the GPU only when
# vocal removal is requested; otherwise leave the handle empty.
separator = (
    demucs.pretrained.get_model('mdx_extra').to('cuda')
    if drop_vocals
    else None
)

# Function to convert mp4 to wav
def convert_mp4_to_wav(filename):
    """Extract the audio track of a video file into a WAV file next to it.

    Args:
        filename: Path of the video file to read.

    Returns:
        Path of the WAV file that was written: ``filename`` with its extension
        replaced by ``.wav``.

    Raises:
        ValueError: If the clip has no audio track to export.
    """
    # splitext only strips a real extension, so dots in directory names are safe.
    fname = os.path.splitext(filename)[0] + '.wav'
    video = moviepy.editor.VideoFileClip(filename)
    try:
        if video.audio is None:
            raise ValueError(f'{filename} has no audio track to convert')
        video.audio.write_audiofile(fname)
    finally:
        # Always close the clip: the caller deletes the source video right after.
        video.close()
    print(f'Converted {filename} to {fname}')
    return fname

# Process each audio file in the target path
# The Demucs helpers are only needed for vocal removal; import them once here
# rather than on every chunk inside the loop.
if drop_vocals and separator is not None:
    from demucs.apply import apply_model
    from demucs.audio import convert_audio

for filename in tqdm(os.listdir(target_path)):
    if not filename.endswith(('.mp3', '.wav', '.flac', '.mp4')):
        continue

    if filename.endswith('.mp4'):
        # Convert MP4 video to WAV, then remove the original video file.
        video_path = os.path.join(target_path, filename)
        wav_path = convert_mp4_to_wav(video_path)
        os.remove(video_path)
        # The converter returns a full path; keep only the file name so the
        # joins below do not repeat target_path when it is a relative path.
        fname = os.path.basename(wav_path)
    else:
        fname = filename

    source_path = os.path.join(target_path, fname)
    # splitext removes the extension whatever its length; slicing off four
    # characters left a stray '.' in the chunk names of '.flac' files.
    base_name = os.path.splitext(fname)[0]
    output_prefix = os.path.join(target_path, base_name)

    # Load the audio file and resample it to 44100 Hz.
    audio = AudioSegment.from_file(source_path)
    audio = audio.set_frame_rate(44100)

    if len(audio) <= 30000:
        # Files of 30 seconds or less are left untouched on disk.
        continue

    print(f'Chunking {fname}')

    # Split audio into 30-second chunks (the step and slice bounds are in ms).
    for i in range(0, len(audio), 30000):
        chunk = audio[i:i + 30000]

        if len(chunk) <= 5000:
            # Ignore a trailing chunk of 5 seconds or less.
            continue

        if drop_vocals and separator is not None:
            print(f'Separating vocals from {base_name}_chunk{i//1000}.wav')

            # Build a float32 (channels, samples) tensor, scaled by the
            # maximum value of the samples' integer type.
            channel_sounds = chunk.split_to_mono()
            samples = [s.get_array_of_samples() for s in channel_sounds]
            chunk_data = np.array(samples).astype(np.float32)
            chunk_data /= np.iinfo(samples[0].typecode).max
            chunk_tensor = torch.from_numpy(chunk_data)

            # Match the separator's sample rate and channel count, then separate.
            chunk_tensor = convert_audio(chunk_tensor, 44100, separator.samplerate, separator.audio_channels)
            stems = apply_model(separator, chunk_tensor[None], device='cuda')

            # Keep the instrumental stems (bass, drums, other) and mix them.
            keep = [separator.sources.index(name) for name in ('bass', 'drums', 'other')]
            mixed_stem = stems[:, keep].sum(1)

            # Save the mixed instrumental audio (vocals removed); make sure
            # the tensor is on the CPU before handing it to torchaudio.
            output_filename = f"{output_prefix}_chunk{i//1000}_mixed.wav"
            torchaudio.save(output_filename, mixed_stem.squeeze(0).cpu(), separator.samplerate)
        else:
            # If not dropping vocals, save the chunk as it is.
            output_filename = f"{output_prefix}_chunk{i//1000}.wav"
            chunk.export(output_filename, format="wav")

    # The chunks replace the original: the source file is deleted once it has
    # been split. Comment this out to keep the originals.
    os.remove(source_path)


### **Chunking audio files with Vocals (If Needed)**


In [None]:
import os
from pydub import AudioSegment

# Define the path where the chunked audio files will be saved
chunks_dir = os.path.join(dataset_path, "chunks")
os.makedirs(chunks_dir, exist_ok=True)  # Create 'chunks' directory if it doesn't exist

for filename in os.listdir(dataset_path):
    if not filename.endswith(('.mp3', '.wav', '.flac')):
        continue

    # Load the original audio file and resample it to 44100 Hz
    audio = AudioSegment.from_file(os.path.join(dataset_path, filename))
    audio = audio.set_frame_rate(44100)

    # splitext removes the extension whatever its length; slicing off four
    # characters left a stray '.' in the chunk names of '.flac' files.
    base_name = os.path.splitext(filename)[0]

    # Split into 30-second chunks
    for i in range(0, len(audio), 30000):  # 30000 ms = 30 seconds
        chunk = audio[i:i+30000]

        # Save the chunk in the 'chunks' directory; the start offset in
        # seconds makes each chunk name unique per source file.
        chunk_filename = f"{base_name}_chunk{i//1000}.wav"
        chunk.export(os.path.join(chunks_dir, chunk_filename), format="wav")


# **Install essentia and requirements**



In [None]:
# install essentia and requirements

# this line is needed for the tensorflow modules to install properly!!
!sudo apt-get install build-essential libeigen3-dev libyaml-dev libfftw3-dev libtag1-dev libchromaprint-dev

!pip install -U essentia-tensorflow

# Download some essentia model weights

In [None]:
# download some essentia model weights

!curl https://essentia.upf.edu/models/classification-heads/genre_discogs400/genre_discogs400-discogs-effnet-1.pb --output genre_discogs400-discogs-effnet-1.pb
!curl https://essentia.upf.edu/models/feature-extractors/discogs-effnet/discogs-effnet-bs64-1.pb --output discogs-effnet-bs64-1.pb
!curl https://essentia.upf.edu/models/classification-heads/mtg_jamendo_moodtheme/mtg_jamendo_moodtheme-discogs-effnet-1.pb --output mtg_jamendo_moodtheme-discogs-effnet-1.pb
!curl https://essentia.upf.edu/models/classification-heads/mtg_jamendo_instrument/mtg_jamendo_instrument-discogs-effnet-1.pb --output mtg_jamendo_instrument-discogs-effnet-1.pb

In [None]:
# @title metadata (labels) for essentia - LONG CELL DONT OPEN
# Class names for the genre classifier. get_audio_features passes this list to
# filter_predictions, which looks labels up by prediction index, so the order
# of the entries must not change. Each entry has the form "Parent---Style";
# get_audio_features replaces the "---" separator with ", ".
genre_labels = [
    "Blues---Boogie Woogie",
    "Blues---Chicago Blues",
    "Blues---Country Blues",
    "Blues---Delta Blues",
    "Blues---Electric Blues",
    "Blues---Harmonica Blues",
    "Blues---Jump Blues",
    "Blues---Louisiana Blues",
    "Blues---Modern Electric Blues",
    "Blues---Piano Blues",
    "Blues---Rhythm & Blues",
    "Blues---Texas Blues",
    "Brass & Military---Brass Band",
    "Brass & Military---Marches",
    "Brass & Military---Military",
    "Children's---Educational",
    "Children's---Nursery Rhymes",
    "Children's---Story",
    "Classical---Baroque",
    "Classical---Choral",
    "Classical---Classical",
    "Classical---Contemporary",
    "Classical---Impressionist",
    "Classical---Medieval",
    "Classical---Modern",
    "Classical---Neo-Classical",
    "Classical---Neo-Romantic",
    "Classical---Opera",
    "Classical---Post-Modern",
    "Classical---Renaissance",
    "Classical---Romantic",
    "Electronic---Abstract",
    "Electronic---Acid",
    "Electronic---Acid House",
    "Electronic---Acid Jazz",
    "Electronic---Ambient",
    "Electronic---Bassline",
    "Electronic---Beatdown",
    "Electronic---Berlin-School",
    "Electronic---Big Beat",
    "Electronic---Bleep",
    "Electronic---Breakbeat",
    "Electronic---Breakcore",
    "Electronic---Breaks",
    "Electronic---Broken Beat",
    "Electronic---Chillwave",
    "Electronic---Chiptune",
    "Electronic---Dance-pop",
    "Electronic---Dark Ambient",
    "Electronic---Darkwave",
    "Electronic---Deep House",
    "Electronic---Deep Techno",
    "Electronic---Disco",
    "Electronic---Disco Polo",
    "Electronic---Donk",
    "Electronic---Downtempo",
    "Electronic---Drone",
    "Electronic---Drum n Bass",
    "Electronic---Dub",
    "Electronic---Dub Techno",
    "Electronic---Dubstep",
    "Electronic---Dungeon Synth",
    "Electronic---EBM",
    "Electronic---Electro",
    "Electronic---Electro House",
    "Electronic---Electroclash",
    "Electronic---Euro House",
    "Electronic---Euro-Disco",
    "Electronic---Eurobeat",
    "Electronic---Eurodance",
    "Electronic---Experimental",
    "Electronic---Freestyle",
    "Electronic---Future Jazz",
    "Electronic---Gabber",
    "Electronic---Garage House",
    "Electronic---Ghetto",
    "Electronic---Ghetto House",
    "Electronic---Glitch",
    "Electronic---Goa Trance",
    "Electronic---Grime",
    "Electronic---Halftime",
    "Electronic---Hands Up",
    "Electronic---Happy Hardcore",
    "Electronic---Hard House",
    "Electronic---Hard Techno",
    "Electronic---Hard Trance",
    "Electronic---Hardcore",
    "Electronic---Hardstyle",
    "Electronic---Hi NRG",
    "Electronic---Hip Hop",
    "Electronic---Hip-House",
    "Electronic---House",
    "Electronic---IDM",
    "Electronic---Illbient",
    "Electronic---Industrial",
    "Electronic---Italo House",
    "Electronic---Italo-Disco",
    "Electronic---Italodance",
    "Electronic---Jazzdance",
    "Electronic---Juke",
    "Electronic---Jumpstyle",
    "Electronic---Jungle",
    "Electronic---Latin",
    "Electronic---Leftfield",
    "Electronic---Makina",
    "Electronic---Minimal",
    "Electronic---Minimal Techno",
    "Electronic---Modern Classical",
    "Electronic---Musique Concrète",
    "Electronic---Neofolk",
    "Electronic---New Age",
    "Electronic---New Beat",
    "Electronic---New Wave",
    "Electronic---Noise",
    "Electronic---Nu-Disco",
    "Electronic---Power Electronics",
    "Electronic---Progressive Breaks",
    "Electronic---Progressive House",
    "Electronic---Progressive Trance",
    "Electronic---Psy-Trance",
    "Electronic---Rhythmic Noise",
    "Electronic---Schranz",
    "Electronic---Sound Collage",
    "Electronic---Speed Garage",
    "Electronic---Speedcore",
    "Electronic---Synth-pop",
    "Electronic---Synthwave",
    "Electronic---Tech House",
    "Electronic---Tech Trance",
    "Electronic---Techno",
    "Electronic---Trance",
    "Electronic---Tribal",
    "Electronic---Tribal House",
    "Electronic---Trip Hop",
    "Electronic---Tropical House",
    "Electronic---UK Garage",
    "Electronic---Vaporwave",
    "Folk, World, & Country---African",
    "Folk, World, & Country---Bluegrass",
    "Folk, World, & Country---Cajun",
    "Folk, World, & Country---Canzone Napoletana",
    "Folk, World, & Country---Catalan Music",
    "Folk, World, & Country---Celtic",
    "Folk, World, & Country---Country",
    "Folk, World, & Country---Fado",
    "Folk, World, & Country---Flamenco",
    "Folk, World, & Country---Folk",
    "Folk, World, & Country---Gospel",
    "Folk, World, & Country---Highlife",
    "Folk, World, & Country---Hillbilly",
    "Folk, World, & Country---Hindustani",
    "Folk, World, & Country---Honky Tonk",
    "Folk, World, & Country---Indian Classical",
    "Folk, World, & Country---Laïkó",
    "Folk, World, & Country---Nordic",
    "Folk, World, & Country---Pacific",
    "Folk, World, & Country---Polka",
    "Folk, World, & Country---Raï",
    "Folk, World, & Country---Romani",
    "Folk, World, & Country---Soukous",
    "Folk, World, & Country---Séga",
    "Folk, World, & Country---Volksmusik",
    "Folk, World, & Country---Zouk",
    "Folk, World, & Country---Éntekhno",
    "Funk / Soul---Afrobeat",
    "Funk / Soul---Boogie",
    "Funk / Soul---Contemporary R&B",
    "Funk / Soul---Disco",
    "Funk / Soul---Free Funk",
    "Funk / Soul---Funk",
    "Funk / Soul---Gospel",
    "Funk / Soul---Neo Soul",
    "Funk / Soul---New Jack Swing",
    "Funk / Soul---P.Funk",
    "Funk / Soul---Psychedelic",
    "Funk / Soul---Rhythm & Blues",
    "Funk / Soul---Soul",
    "Funk / Soul---Swingbeat",
    "Funk / Soul---UK Street Soul",
    "Hip Hop---Bass Music",
    "Hip Hop---Boom Bap",
    "Hip Hop---Bounce",
    "Hip Hop---Britcore",
    "Hip Hop---Cloud Rap",
    "Hip Hop---Conscious",
    "Hip Hop---Crunk",
    "Hip Hop---Cut-up/DJ",
    "Hip Hop---DJ Battle Tool",
    "Hip Hop---Electro",
    "Hip Hop---G-Funk",
    "Hip Hop---Gangsta",
    "Hip Hop---Grime",
    "Hip Hop---Hardcore Hip-Hop",
    "Hip Hop---Horrorcore",
    "Hip Hop---Instrumental",
    "Hip Hop---Jazzy Hip-Hop",
    "Hip Hop---Miami Bass",
    "Hip Hop---Pop Rap",
    "Hip Hop---Ragga HipHop",
    "Hip Hop---RnB/Swing",
    "Hip Hop---Screw",
    "Hip Hop---Thug Rap",
    "Hip Hop---Trap",
    "Hip Hop---Trip Hop",
    "Hip Hop---Turntablism",
    "Jazz---Afro-Cuban Jazz",
    "Jazz---Afrobeat",
    "Jazz---Avant-garde Jazz",
    "Jazz---Big Band",
    "Jazz---Bop",
    "Jazz---Bossa Nova",
    "Jazz---Contemporary Jazz",
    "Jazz---Cool Jazz",
    "Jazz---Dixieland",
    "Jazz---Easy Listening",
    "Jazz---Free Improvisation",
    "Jazz---Free Jazz",
    "Jazz---Fusion",
    "Jazz---Gypsy Jazz",
    "Jazz---Hard Bop",
    "Jazz---Jazz-Funk",
    "Jazz---Jazz-Rock",
    "Jazz---Latin Jazz",
    "Jazz---Modal",
    "Jazz---Post Bop",
    "Jazz---Ragtime",
    "Jazz---Smooth Jazz",
    "Jazz---Soul-Jazz",
    "Jazz---Space-Age",
    "Jazz---Swing",
    "Latin---Afro-Cuban",
    "Latin---Baião",
    "Latin---Batucada",
    "Latin---Beguine",
    "Latin---Bolero",
    "Latin---Boogaloo",
    "Latin---Bossanova",
    "Latin---Cha-Cha",
    "Latin---Charanga",
    "Latin---Compas",
    "Latin---Cubano",
    "Latin---Cumbia",
    "Latin---Descarga",
    "Latin---Forró",
    "Latin---Guaguancó",
    "Latin---Guajira",
    "Latin---Guaracha",
    "Latin---MPB",
    "Latin---Mambo",
    "Latin---Mariachi",
    "Latin---Merengue",
    "Latin---Norteño",
    "Latin---Nueva Cancion",
    "Latin---Pachanga",
    "Latin---Porro",
    "Latin---Ranchera",
    "Latin---Reggaeton",
    "Latin---Rumba",
    "Latin---Salsa",
    "Latin---Samba",
    "Latin---Son",
    "Latin---Son Montuno",
    "Latin---Tango",
    "Latin---Tejano",
    "Latin---Vallenato",
    "Non-Music---Audiobook",
    "Non-Music---Comedy",
    "Non-Music---Dialogue",
    "Non-Music---Education",
    "Non-Music---Field Recording",
    "Non-Music---Interview",
    "Non-Music---Monolog",
    "Non-Music---Poetry",
    "Non-Music---Political",
    "Non-Music---Promotional",
    "Non-Music---Radioplay",
    "Non-Music---Religious",
    "Non-Music---Spoken Word",
    "Pop---Ballad",
    "Pop---Bollywood",
    "Pop---Bubblegum",
    "Pop---Chanson",
    "Pop---City Pop",
    "Pop---Europop",
    "Pop---Indie Pop",
    "Pop---J-pop",
    "Pop---K-pop",
    "Pop---Kayōkyoku",
    "Pop---Light Music",
    "Pop---Music Hall",
    "Pop---Novelty",
    "Pop---Parody",
    "Pop---Schlager",
    "Pop---Vocal",
    "Reggae---Calypso",
    "Reggae---Dancehall",
    "Reggae---Dub",
    "Reggae---Lovers Rock",
    "Reggae---Ragga",
    "Reggae---Reggae",
    "Reggae---Reggae-Pop",
    "Reggae---Rocksteady",
    "Reggae---Roots Reggae",
    "Reggae---Ska",
    "Reggae---Soca",
    "Rock---AOR",
    "Rock---Acid Rock",
    "Rock---Acoustic",
    "Rock---Alternative Rock",
    "Rock---Arena Rock",
    "Rock---Art Rock",
    "Rock---Atmospheric Black Metal",
    "Rock---Avantgarde",
    "Rock---Beat",
    "Rock---Black Metal",
    "Rock---Blues Rock",
    "Rock---Brit Pop",
    "Rock---Classic Rock",
    "Rock---Coldwave",
    "Rock---Country Rock",
    "Rock---Crust",
    "Rock---Death Metal",
    "Rock---Deathcore",
    "Rock---Deathrock",
    "Rock---Depressive Black Metal",
    "Rock---Doo Wop",
    "Rock---Doom Metal",
    "Rock---Dream Pop",
    "Rock---Emo",
    "Rock---Ethereal",
    "Rock---Experimental",
    "Rock---Folk Metal",
    "Rock---Folk Rock",
    "Rock---Funeral Doom Metal",
    "Rock---Funk Metal",
    "Rock---Garage Rock",
    "Rock---Glam",
    "Rock---Goregrind",
    "Rock---Goth Rock",
    "Rock---Gothic Metal",
    "Rock---Grindcore",
    "Rock---Grunge",
    "Rock---Hard Rock",
    "Rock---Hardcore",
    "Rock---Heavy Metal",
    "Rock---Indie Rock",
    "Rock---Industrial",
    "Rock---Krautrock",
    "Rock---Lo-Fi",
    "Rock---Lounge",
    "Rock---Math Rock",
    "Rock---Melodic Death Metal",
    "Rock---Melodic Hardcore",
    "Rock---Metalcore",
    "Rock---Mod",
    "Rock---Neofolk",
    "Rock---New Wave",
    "Rock---No Wave",
    "Rock---Noise",
    "Rock---Noisecore",
    "Rock---Nu Metal",
    "Rock---Oi",
    "Rock---Parody",
    "Rock---Pop Punk",
    "Rock---Pop Rock",
    "Rock---Pornogrind",
    "Rock---Post Rock",
    "Rock---Post-Hardcore",
    "Rock---Post-Metal",
    "Rock---Post-Punk",
    "Rock---Power Metal",
    "Rock---Power Pop",
    "Rock---Power Violence",
    "Rock---Prog Rock",
    "Rock---Progressive Metal",
    "Rock---Psychedelic Rock",
    "Rock---Psychobilly",
    "Rock---Pub Rock",
    "Rock---Punk",
    "Rock---Rock & Roll",
    "Rock---Rockabilly",
    "Rock---Shoegaze",
    "Rock---Ska",
    "Rock---Sludge Metal",
    "Rock---Soft Rock",
    "Rock---Southern Rock",
    "Rock---Space Rock",
    "Rock---Speed Metal",
    "Rock---Stoner Rock",
    "Rock---Surf",
    "Rock---Symphonic Rock",
    "Rock---Technical Death Metal",
    "Rock---Thrash",
    "Rock---Twist",
    "Rock---Viking Metal",
    "Rock---Yé-Yé",
    "Stage & Screen---Musical",
    "Stage & Screen---Score",
    "Stage & Screen---Soundtrack",
    "Stage & Screen---Theme",
]
# Class names for the mood/theme classifier. get_audio_features passes this
# list to filter_predictions, which looks labels up by prediction index, so
# the order of the entries must not change.
mood_theme_classes = [
    "action",
    "adventure",
    "advertising",
    "background",
    "ballad",
    "calm",
    "children",
    "christmas",
    "commercial",
    "cool",
    "corporate",
    "dark",
    "deep",
    "documentary",
    "drama",
    "dramatic",
    "dream",
    "emotional",
    "energetic",
    "epic",
    "fast",
    "film",
    "fun",
    "funny",
    "game",
    "groovy",
    "happy",
    "heavy",
    "holiday",
    "hopeful",
    "inspiring",
    "love",
    "meditative",
    "melancholic",
    "melodic",
    "motivational",
    "movie",
    "nature",
    "party",
    "positive",
    "powerful",
    "relaxing",
    "retro",
    "romantic",
    "sad",
    "sexy",
    "slow",
    "soft",
    "soundscape",
    "space",
    "sport",
    "summer",
    "trailer",
    "travel",
    "upbeat",
    "uplifting"
]
# Class names for the instrument classifier. get_audio_features passes this
# list to filter_predictions, which looks labels up by prediction index, so
# the order of the entries must not change.
instrument_classes = [
    "accordion",
    "acousticbassguitar",
    "acousticguitar",
    "bass",
    "beat",
    "bell",
    "bongo",
    "brass",
    "cello",
    "clarinet",
    "classicalguitar",
    "computer",
    "doublebass",
    "drummachine",
    "drums",
    "electricguitar",
    "electricpiano",
    "flute",
    "guitar",
    "harmonica",
    "harp",
    "horn",
    "keyboard",
    "oboe",
    "orchestra",
    "organ",
    "pad",
    "percussion",
    "piano",
    "pipeorgan",
    "rhodes",
    "sampler",
    "saxophone",
    "strings",
    "synthesizer",
    "trombone",
    "trumpet",
    "viola",
    "violin",
    "voice"
]

In [None]:
from essentia.standard import MonoLoader, TensorflowPredictEffnetDiscogs, TensorflowPredict2D
import numpy as np

def filter_predictions(predictions, class_list, threshold=0.1):
    """Pick the classes whose mean score is above ``threshold``.

    Args:
        predictions: 2-D scores; they are averaged over the first axis, leaving
            one mean score per class.
        class_list: Class names, indexed in the same order as the score columns.
        threshold: A class is kept only when its mean score is strictly greater.

    Returns:
        A ``(labels, scores)`` pair of lists, ordered from the highest mean
        score to the lowest.
    """
    mean_scores = np.mean(predictions, axis=0)
    kept_labels = []
    kept_scores = []
    # argsort is ascending, so walk it backwards for best-first order.
    for idx in np.argsort(mean_scores)[::-1]:
        score = mean_scores[idx]
        if score > threshold:
            kept_labels.append(class_list[idx])
            kept_scores.append(score)
    return kept_labels, kept_scores

def make_comma_separated_unique(tags):
    """Flatten ``tags`` into one ', '-separated string without repeated entries.

    Every item may itself contain several ', '-separated tags. The first
    occurrence of each tag is kept, in its original position.
    """
    pieces = ', '.join(tags).split(', ')
    # dict keys preserve insertion order, which gives order-preserving dedup.
    return ', '.join(dict.fromkeys(pieces))

# Essentia model instances keyed by (algorithm class, constructor kwargs), so
# each TensorFlow graph is loaded once instead of once per audio file.
_ESSENTIA_MODELS = {}


def _get_essentia_model(algorithm, **kwargs):
    """Return a cached ``algorithm(**kwargs)`` instance, creating it on first use."""
    cache_key = (algorithm, tuple(sorted(kwargs.items())))
    if cache_key not in _ESSENTIA_MODELS:
        _ESSENTIA_MODELS[cache_key] = algorithm(**kwargs)
    return _ESSENTIA_MODELS[cache_key]


def get_audio_features(audio_filename):
    """Tag one audio file with genres, moods/themes and instruments.

    The file is loaded as mono audio at 16 kHz, turned into embeddings by the
    Discogs-EffNet model, and the embeddings are fed to three classifier heads.
    The ``.pb`` graph files are looked up relative to the current directory.

    Args:
        audio_filename: Path of the audio file to analyse.

    Returns:
        A dict with:
            'genres': comma-separated string of unique genre tags,
            'moods': comma-separated string of unique mood/theme tags,
            'instruments': list of instrument labels.
    """
    audio = MonoLoader(filename=audio_filename, sampleRate=16000, resampleQuality=4)()
    embedding_model = _get_essentia_model(
        TensorflowPredictEffnetDiscogs,
        graphFilename="discogs-effnet-bs64-1.pb",
        output="PartitionedCall:1",
    )
    embeddings = embedding_model(audio)

    result_dict = {}

    # predict genres
    genre_model = _get_essentia_model(
        TensorflowPredict2D,
        graphFilename="genre_discogs400-discogs-effnet-1.pb",
        input="serving_default_model_Placeholder",
        output="PartitionedCall:0",
    )
    predictions = genre_model(embeddings)
    filtered_labels, _ = filter_predictions(predictions, genre_labels)
    # "Parent---Style" labels become separate "Parent" and "Style" tags.
    filtered_labels = ', '.join(filtered_labels).replace("---", ", ").split(', ')
    result_dict['genres'] = make_comma_separated_unique(filtered_labels)

    # predict mood/theme (lower threshold than the other two heads)
    mood_model = _get_essentia_model(
        TensorflowPredict2D,
        graphFilename="mtg_jamendo_moodtheme-discogs-effnet-1.pb",
    )
    predictions = mood_model(embeddings)
    filtered_labels, _ = filter_predictions(predictions, mood_theme_classes, threshold=0.05)
    result_dict['moods'] = make_comma_separated_unique(filtered_labels)

    # predict instruments
    instrument_model = _get_essentia_model(
        TensorflowPredict2D,
        graphFilename="mtg_jamendo_instrument-discogs-effnet-1.pb",
    )
    predictions = instrument_model(embeddings)
    filtered_labels, _ = filter_predictions(predictions, instrument_classes)
    result_dict['instruments'] = filtered_labels

    return result_dict

In [None]:
import os
import json
import random
import librosa
import numpy as np
from pydub import AudioSegment
import wave
import re
from functools import partial
from tqdm import tqdm
from numba import cuda

# Set the following variable to True if you want to see a progress bar instead of the printed results
use_tqdm = False

# Define the dataset path
dataset_path = "/content/drive/MyDrive/raw/chunks"  # Adjust this path

# Make sure the .jsonl files have a place to go
os.makedirs("/content/audiocraft/egs/train", exist_ok=True)
os.makedirs("/content/audiocraft/egs/eval", exist_ok=True)

# Note names indexed by the strongest chroma row; used for the rough key estimate.
PITCH_CLASSES = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']


def extract_artist_from_filename(filename):
    """Derive an artist name from a chunk file name (modify for your dataset).

    Expects names shaped like "<artist> <number>_chunk<offset>.wav" and returns
    the "<artist>" part with any "mix" substring removed. Returns "" when the
    file name does not follow that pattern.
    """
    match = re.search(r'(.+?)\s\d+_chunk\d+\.wav', filename)
    artist = match.group(1) if match else ""
    return artist.replace("mix", "").strip() if "mix" in artist else artist


train_len = 0
eval_len = 0

# Open the train and eval .jsonl files for writing
with open("/content/audiocraft/egs/train/data.jsonl", "w") as train_file, \
     open("/content/audiocraft/egs/eval/data.jsonl", "w") as eval_file:

    # Shuffle the plain list first: random.shuffle needs an indexable
    # sequence, which a tqdm wrapper is not.
    filenames = os.listdir(dataset_path)
    random.shuffle(filenames)
    dset = tqdm(filenames) if use_tqdm else filenames

    for filename in dset:
        try:
            file_path = os.path.join(dataset_path, filename)

            # Genre / mood / instrument tags from the essentia models
            result = get_audio_features(file_path)

            # Load the waveform for the key and BPM estimates
            y, sr = librosa.load(file_path)
            if len(y) == 0:
                print(f"Warning: Empty audio data for file {filename}")
                continue  # Skip empty audio files

            # Extract tempo (BPM). Depending on the librosa version the tempo
            # is a scalar or a one-element array; atleast_1d handles both.
            tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
            tempo = int(round(float(np.atleast_1d(tempo)[0])))

            # Extract chroma features and key
            chroma = librosa.feature.chroma_stft(y=y, sr=sr)
            if chroma.shape[1] == 0:
                print(f"Warning: Chroma feature extraction failed for file {filename}")
                continue  # Skip if chroma extraction failed

            # Rough key estimate: the pitch class with the most chroma energy
            key = PITCH_CLASSES[np.argmax(np.sum(chroma, axis=1))]

            # Extract duration
            length = librosa.get_duration(y=y, sr=sr)

            artist_name = extract_artist_from_filename(filename)

            # get_audio_features returns the instruments as a list; write them
            # as a comma-separated string like the genre and mood fields.
            # NOTE(review): AudioCraft's metadata loader appears to expect a
            # string for `instrument` - confirm against your version.
            instruments = result.get('instruments', "")
            if not isinstance(instruments, str):
                instruments = ", ".join(instruments)

            # Create entry for jsonl
            entry = {
                "key": f"{key}",
                "artist": artist_name,
                "sample_rate": 44100,
                "file_extension": "wav",
                "description": "",
                "keywords": "",
                "duration": length,
                "bpm": tempo,
                "genre": result.get('genres', ""),
                "title": "",
                "name": "",
                "instrument": instruments,
                "moods": result.get('moods', []),
                "path": file_path,
            }

            # Show the entry unless the progress bar was requested instead
            if not use_tqdm:
                print(entry)

            # Train/test split (85% train, 15% eval)
            if random.random() < 0.85:
                train_len += 1
                train_file.write(json.dumps(entry) + '\n')
            else:
                eval_len += 1
                eval_file.write(json.dumps(entry) + '\n')

        except Exception as e:
            # Best effort: report the file that failed and carry on with the rest.
            print(f"Error extracting features from file {filename}: {e}")
            continue

print(f"Training set size: {train_len}")
print(f"Evaluation set size: {eval_len}")

# Clear CUDA memory if necessary (skipped when no CUDA device is available)
if cuda.is_available():
    device = cuda.get_current_device()
    device.reset()


In [None]:
# Write the dataset config file for the training run. The training cell selects
# it with `dset=audio/train` (presumably resolved relative to config/dset - confirm).

config_path = "/content/audiocraft/config/dset/audio/train.yaml"

# Folders that contain the data.jsonl files written by the metadata cell
data_path = "/content/audiocraft/egs/train"
eval_data_path = "/content/audiocraft/egs/eval"

# The first line of the file must start with the "package" marker; it is
# assembled from this variable because the author could not put the marker
# literally inside the string in this notebook.
package = "package" # yay python not letting me put #@.package in a string :/
yaml_contents = f"""#@{package} __global__

datasource:
  max_channels: 2
  max_sample_rate: 44100

  evaluate: {eval_data_path}
  generate: {data_path}
  train: {data_path}
  valid: {eval_data_path}
"""

# Overwrites any existing file at config_path.
with open(config_path, 'w') as yaml_file:
    yaml_file.write(yaml_contents)

In [None]:
%env USER=lyra
# CHANGE THIS: user name exported for the training run below (presumably used
# by dora/AudioCraft to build its output location - confirm).

# The command is assembled from adjacent string literals, so every option
# except the first must start with a space.
command = (
    "dora -P audiocraft run"
    " solver=musicgen/musicgen_base_32khz"
    " model/lm/model_scale=large"  # model size; keep in line with continue_from below
    " continue_from=//pretrained/facebook/musicgen-large"  # pretrained weights to start from
    " conditioner=text2music"
    " dset=audio/train"  # the dataset config written by the previous cell
    " dataset.num_workers=2"
    " dataset.valid.num_samples=1"
    " dataset.batch_size=2"  # CHANGE THIS to fit your GPU memory
    " schedule.cosine.warmup=8"
    " optim.optimizer=adamw"  # author's note: adamw is already the default
    " optim.lr=1e-4"
    " optim.epochs=5"  # training stops after 5 epochs - change as needed
    " optim.updates_per_epoch=1000"  # author's note: 2000 by default; lower it to get checkpoints sooner
    " optim.adam.weight_decay=0.01"
    " generate.lm.prompted_samples=False"  # skip the very long generate step
    " generate.lm.gen_gt_samples=True"
)

# Run the assembled command in a shell.
!{command}


In [None]:
import torch
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write

try:
    # Not used below. Guarded so the cell still runs when the installed
    # AudioCraft version does not provide this name.
    from audiocraft.utils import download_pretrained
except ImportError:
    download_pretrained = None

# Define the model path where the fine-tuned model is stored.
# NOTE(review): replace the placeholder with the checkpoint written by your
# training run (dora normally saves it as `checkpoint.th` in the experiment
# folder - confirm the exact location for your setup).
model_path = '/content/audiocraft/xps/<experiment_id>/checkpoints/model.ckpt'


def _extract_lm_state_dict(checkpoint):
    """Return the language-model weights from a loaded checkpoint object.

    Handles a solver checkpoint (weights nested under
    ``['best_state']['model']`` or ``['fsdp_best_state']['model']``), an
    exported checkpoint (weights directly under ``['best_state']``) and a bare
    state dict, which is returned unchanged.
    """
    if isinstance(checkpoint, dict):
        for state_key in ('fsdp_best_state', 'best_state'):
            state = checkpoint.get(state_key)
            if state:
                return state['model'] if 'model' in state else state
    return checkpoint


# MusicGen is a wrapper object, not a torch module: the fine-tuned weights
# belong to its language model (`model.lm`).
model = MusicGen.get_pretrained("facebook/musicgen-large")
checkpoint = torch.load(model_path, map_location="cpu")
model.lm.load_state_dict(_extract_lm_state_dict(checkpoint))

# Set the language model to evaluation mode
model.lm.eval()


# Generate music
def generate_music(prompt, num_samples=1):
    """Generate ``num_samples`` clips for ``prompt`` and save them as WAV files.

    Args:
        prompt: Text description of the music to generate.
        num_samples: Number of clips to generate for the same prompt.

    The clips are written to ``generated_sample_<i>.wav`` in the current
    directory; each saved path is printed.
    """
    # One description per requested sample
    descriptions = [prompt] * num_samples

    # Generate music based on the input prompt
    with torch.no_grad():
        generated_audio = model.generate(descriptions)

    # Save the generated audio; audio_write appends the ".wav" extension itself
    for i, audio in enumerate(generated_audio):
        audio_path = audio_write(
            f"generated_sample_{i}", audio.cpu(), model.sample_rate, strategy="loudness"
        )
        print(f"Generated music saved to {audio_path}")


# Example prompt to generate music
prompt = "A relaxing piano piece with ambient sounds."
generate_music(prompt)
