# P.A.G.U.R.I. : Prompt Audio Generator User Research Investigation
Generate audio samples from text prompts and create your own custom music through model fine-tuning.

In [1]:
# Insert User ID: identifier of the user running this session.
# It names the per-user folder and the per-user log file used by the cells below.
user_id = "111111"

In [2]:
#IMPORTS & SETTINGS
#--------------------------------------------------------------------------------------------
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import io
import scipy
import torch
import IPython
import ipywidgets as widgets
import random
import os

''' For setting particular GPUs devices'''
os.environ["CUDA_VISIBLE_DEVICES"]="3,4,5"

import datetime
import pytz

from ipywidgets import*
from numba import cuda
from diffusers import DPMSolverMultistepScheduler
from dreambooth_audioldm2_notebook import train
from pipeline.pipeline_audioldm import AudioLDMPipeline
from pipeline.pipeline_audioldm2 import AudioLDM2Pipeline
from IPython.display import Audio
from IPython.display import display
from IPython.display import Javascript

import logging
logging.getLogger().setLevel(logging.CRITICAL)

#--------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------
#-------------------------------GLOBAL VARIABLES---------------------------------------
#--------------------------------------------------------------------------------------
# Root folder under which one sub-folder per user id is created.
path_experiments = "/nas/home/gperego/projects/audiocraft/UserExperience"
# Identifier handed to AudioLDM2Pipeline.from_pretrained when the base model is selected.
model_audioldm2 = "cvssp/audioldm2"
# Sample rate (Hz) used when loading, playing back and saving audio.
SAMPLE_RATE = 16000
#--------------------------------------------------------------------------------------
#----------------------------------FUNCTIONS-------------------------------------------
#--------------------------------------------------------------------------------------
# GPU AVAILABILITY
def gpu_availability_function():
    """Print CUDA availability and a per-GPU memory summary.

    For every visible device the total, allocated ("busy") and reserved
    ("busy cache") memory of *this process* is printed, followed by the
    memory still unreserved by PyTorch's caching allocator.

    Returns:
        None. All information is written to standard output.
    """
    print(f"GPU Avaialbility : {torch.cuda.is_available()}\n"
          f"Version : {torch.version.cuda}")
    if not torch.cuda.is_available():
        print("CUDA is not available. Make sure you have a GPU and PyTorch configured correctly.")
        return

    gib = 1024 ** 3
    num_devices = torch.cuda.device_count()
    print(f"Number of GPU available: {num_devices}")
    for i in range(num_devices):
        gpu_device = torch.cuda.get_device_properties(i)
        print(f"GPU {i}: {gpu_device.name}\n -Total memory: {gpu_device.total_memory / gib:.2f} GB")
        allocated_memory = torch.cuda.memory_allocated(i)
        print(f" -Busy memory: {allocated_memory / gib:.2f} GB")
        cached_memory = torch.cuda.memory_reserved(i)
        print(f" -Busy cache memory: {cached_memory / gib:.2f} GB")
        # Reserved memory already contains the allocated memory, so only the
        # reserved amount is subtracted; subtracting both would count the
        # allocated bytes twice and under-report what is free.
        free_memory = gpu_device.total_memory - cached_memory
        print(f" -Free memory: {free_memory / gib:.2f} GB")
        
def restartkernel():
    """Shut down the kernel of the running IPython application.

    `True` is passed to `do_shutdown`; presumably this is the restart flag
    (confirm against the ipykernel API).
    """
    application = IPython.Application.instance()
    kernel = application.kernel
    kernel.do_shutdown(True)
    
def norm_audio(audio):
    """Peak-normalise a signal so that its largest absolute sample is 1.

    Args:
        audio: Array-like of samples.

    Returns:
        `audio / max(|audio|)`. The input is returned unchanged when it is
        empty or when its peak is 0 (nothing to scale, and dividing would
        raise or produce NaNs).
    """
    # np.max raises ValueError on a zero-size array, so bail out early.
    if np.size(audio) == 0:
        return audio
    max_val = np.max(np.abs(audio))
    if max_val == 0:
        return audio
    return audio / max_val
    
def folder_creation(path,folder):
    """Make sure `path/folder` exists and return its full path.

    Prints whether the folder was created or was already there. When
    `folder` equals the global `user_id`, a session header (separator line,
    user id and current Europe/Rome time) is appended to the user's data
    file inside that folder.
    """
    global user_id
    full_path = os.path.join(path,folder)
    if os.path.exists(full_path):
        print(f"Already exist : {full_path}")
    else:
        os.makedirs(full_path)
        print(f"Created {full_path}")

    # Only the user's own root folder gets a session header.
    if folder != user_id:
        return full_path

    rome_now = datetime.datetime.now(pytz.timezone('Europe/Rome'))
    log_file = full_path+f"/user_{user_id}_data.txt"
    header = "-------------------------------------------------------------------"+'\n'+user_id+'_'+str(rome_now)+'\n'
    with open(log_file, 'a') as file:
        file.write(header)
    return full_path

def path_exist(input_path):
    """Return `input_path` unchanged when it exists on disk, otherwise None."""
    return input_path if os.path.exists(input_path) else None
        
def change_data(path_data,w,id):
    """Replace one '_'-separated field in the first line of a text file.

    Args:
        path_data: Path of the text file to edit in place.
        w: New value for the field.
        id: Index of the field (within the first line split on '_').

    Raises:
        IndexError: If `id` is out of range; the file is left untouched.

    The first line is written back stripped and without a trailing newline.
    Every line after the first is preserved; rewriting only the first line
    would silently discard the rest of the file.
    """
    with open(path_data, 'r') as file:
        first_line = file.readline().strip()
        remainder = file.read()

    words = first_line.split('_')
    words[id] = w
    new_line = '_'.join(words)

    with open(path_data, 'w') as file:
        file.write(new_line)
        if remainder:
            # Put back the line break that strip() removed before the rest.
            file.write('\n' + remainder)

def folder_string_counter(directory, string):
    """Count the sub-directories of `directory` whose name contains `string`."""
    return sum(
        1
        for entry in os.listdir(directory)
        if os.path.isdir(os.path.join(directory, entry)) and string in entry
    )

def folder_file_counter(directory):
    """Return how many entries `directory` contains, or None if it does not exist."""
    if path_exist(directory) is None:
        return None
    return len(os.listdir(directory))

def model_available_setup(model_path):
    """List the model names offered in the GUI.

    'AudioLDM2 Model' is always listed; 'My Model' is added when
    `model_path` contains a "trained_pipeline" entry.
    """
    choices = ['AudioLDM2 Model']
    if path_exist(model_path+"/trained_pipeline") is not None:
        choices.append('My Model')
    return choices

def folder_available_setup(directory):
    """Return `(full_paths, names)` for the entries of `directory`.

    Both lists follow the order given by `os.listdir`. Returns None when
    `directory` does not exist.
    """
    if path_exist(directory) is None:
        return None
    names = os.listdir(directory)
    full_paths = [os.path.join(directory, name) for name in names]
    return full_paths, names

def file_show(directory):
    """Return the entry names of `directory` as a list, or None if it does not exist."""
    if path_exist(directory) is None:
        return None
    return list(os.listdir(directory))
#------------------------------------------------------------------------------------------------------------------
#------------------------------------------------INITIALIZATION----------------------------------------------------
#------------------------------------------------------------------------------------------------------------------
# Session start-up: report the environment, create the per-user folders and
# collect the state (models, datasets, counters) the GUI cells rely on.
print("\n--------------------------------------------USER ID-------------------------------------------------------")
print(f"\nUSER ID : {user_id}\n")
print("\n----------------------------------------GPU AVAILABILITY---------------------------------------------------")
gpu_availability_function()
# NOTE(review): this only binds a Python variable; it does not set the
# CUDA_LAUNCH_BLOCKING environment variable.
CUDA_LAUNCH_BLOCKING=1
print("-----------------------------------------------------------------------------------------------------------")
print("\n----------------------------------------INITIALIZATION---------------------------------------------------")
user_path = folder_creation(path_experiments,user_id)
user_txt_path = user_path+f"/user_{user_id}_data.txt"
sounds_id_path = folder_creation(user_path,"Generated_Sounds")
model_path = folder_creation(user_path,"Model")
models = model_available_setup(model_path)
# Counters are seeded from what is already on disk for this user.
sounds_id = folder_file_counter(sounds_id_path)
data = folder_string_counter(user_path,"Training_Data")
train_status = 0
generation_number = 0
print("---------------------------------------------------------------------------------------------------------")
print(f"Models  available: {models}")
# The datasets folder lives under the experiments root. folder_available_setup
# returns None when it is missing, which cannot be unpacked, so fall back to
# empty lists instead of failing with an opaque TypeError.
datasets_root = os.path.join(path_experiments, "UserDatasets")
available_datasets = folder_available_setup(datasets_root)
if available_datasets is None:
    print(f"Dataset folder not found : {datasets_root}")
    foldersData, folderNames = [], []
else:
    foldersData, folderNames = available_datasets
print(f"Number of dataset available : {folderNames}")
print("---------------------------------------------READY---------------------------------------------------------")


--------------------------------------------USER ID-------------------------------------------------------

USER ID : 111111


----------------------------------------GPU AVAILABILITY---------------------------------------------------
GPU Avaialbility : True
Version : 11.7
Number of GPU available: 3
GPU 0: NVIDIA TITAN RTX
 -Total memory: 23.65 GB
 -Busy memory: 0.00 GB
 -Busy cache memory: 0.00 GB
 -Free memory: 23.65 GB
GPU 1: NVIDIA TITAN RTX
 -Total memory: 23.65 GB
 -Busy memory: 0.00 GB
 -Busy cache memory: 0.00 GB
 -Free memory: 23.65 GB
GPU 2: NVIDIA TITAN RTX
 -Total memory: 23.65 GB
 -Busy memory: 0.00 GB
 -Busy cache memory: 0.00 GB
 -Free memory: 23.65 GB
-----------------------------------------------------------------------------------------------------------

----------------------------------------INITIALIZATION---------------------------------------------------
Already exist : /nas/home/gperego/projects/audiocraft/UserExperience/111111
Created /nas/home/gperego/projec

In [3]:
# GENERATION
#FUNCTIONS

def load_sound(input_string):
    """Generate normalised waveforms for a text prompt.

    Prompts shorter than two characters are rejected with None. Otherwise the
    user's fine-tuned pipeline is loaded when "My Model" is selected and its
    folder exists, else the base AudioLDM2 model; the number of waveforms and
    their length come from the `num_waves` and `audio_lenght` sliders.

    Returns:
        The pipeline's `.audios` with every waveform peak-normalised, or None.
    """
    if len(input_string) < 2:
        return None

    # Pick the model to load.
    trained_dir = user_path+"/Model/trained_pipeline"
    use_trained = model_available.value == "My Model" and path_exist(trained_dir)
    model_id = trained_dir if use_trained else model_audioldm2
    pipe = AudioLDM2Pipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")

    # Generate, then normalise each waveform in place.
    w = pipe(
        input_string,
        num_inference_steps=200,
        num_waveforms_per_prompt=num_waves.value,
        audio_length_in_s=audio_lenght.value,
    ).audios
    for idx in range(len(w)):
        w[idx] = norm_audio(w[idx])
    return w

def clear_text(b):
    """Button callback: empty the prompt text area. `b` is not used."""
    text.value = ''
    
def clear_output(b):
    """Button callback: clear the generation output widget. `b` is not used."""
    output_generation.clear_output()
    
def _safe_file_stem(name):
    """Return `name` with path separators and control characters replaced by '_'."""
    separators = {os.sep, os.altsep} - {None}
    return "".join("_" if (ch in separators or ord(ch) < 32) else ch for ch in name)


def on_button_generation(b):
    """Button callback: generate audio for the current prompt, log and save it.

    For every generated waveform the callback prints its name, appends a line
    `name_model_numberOfWaveforms_lengthInSeconds` to the user's data file,
    writes a .wav file into the generated-sounds folder, shows an audio
    player and increments the global `sounds_id` counter. Prints
    "Prompt not valid!!!" when `load_sound` rejects the prompt.

    The prompt is part of the file name, so path separators and control
    characters (e.g. '/' or a newline typed in the text area) are replaced
    by '_' in the file name only; the printed and logged name keeps the
    prompt as typed.
    """
    global sounds_id
    output_generation.clear_output()
    with output_generation:
        rate = SAMPLE_RATE
        audios = load_sound(text.value)
        if audios is None:
            print("Prompt not valid!!!")
            return
        output_generation.clear_output()

        with open(user_path+f"/user_{user_id}_data.txt", 'a') as file:
            file.write("GENERATION\n")

        for i, x in enumerate(audios):
            # Two-digit, zero-padded running id, e.g. G03-1-<prompt>.
            name = f"G{sounds_id:02d}-{i+1}-{text.value}"
            print(name)
            audio_widget = Audio(data=x,rate=rate)

            with open(user_txt_path, 'a') as file:
                # Name_modelName_numberOfWaveformsGenerated_lengthInSeconds
                file.write(f"{name}_{model_available.value}_{num_waves.value}_{audio_lenght.value}\n")

            scipy.io.wavfile.write(sounds_id_path+"/"+_safe_file_stem(name)+".wav",rate=rate,data=x)
            display(audio_widget)
            sounds_id = sounds_id+1
    
# GUI OBJECTS           
# Output areas: generated audio is rendered into output_generation.
output_generation = widgets.Output(layout={'border': '2px solid white'})   
output_evaluation = widgets.Output(layout={'border': '2px solid white'}) 

# Prompt input and its helper buttons.
text = widgets.Textarea(value='',placeholder='...',description='',disabled=False,rows=2,layout={'width': '500px'})
clear_text_button = widgets.Button(description="Delete text")
clear_output_button = widgets.Button(description="Clear output")
empty_button = widgets.Button(description="", disabled=True)

# Generation controls: model choice, number of waveforms per prompt, length in seconds.
generate_sound_button = widgets.Button(description="Generate",layout={'border': '2px solid green'})
model_available = Dropdown(options=models, description='')
num_waves = widgets.IntSlider(description=' ', min=1, max=5, step=1, value=3)
audio_lenght = widgets.IntSlider(description=' ', min=2.0, max=10.0, step=1.0, value=5.0)

# Rows shown in the generation section (clear_text_button is wired below but not placed in a row here).
horizontal_text_box = widgets.HBox([text,generate_sound_button,clear_output_button])
horizontal_generation_box = widgets.HBox([model_available,num_waves,audio_lenght])


# EVENTS   
# Connect each button to its callback.
clear_text_button.on_click(clear_text)
clear_output_button.on_click(clear_output)
generate_sound_button.on_click(on_button_generation)

#------------------------------------------------------------------------------------------------------------------------------------------

In [4]:
# FINE-TUNING
#FUNCTIONS
# Toggle state of the listen/hide button: handle_upload flips it between
# 1 (file list shown) and 0 (file list hidden).
check_file_button = 0

def f(x):
    """Identity helper: return the argument unchanged."""
    return x

def f_1_elements(a):
    """Print the single value `a`."""
    print(a)

def f_2_elements(a, b):
    """Print the two values as a tuple."""
    pair = (a, b)
    print(pair)
    
def f_3_elements(a, b, c):
    """Print the three values as a tuple."""
    triple = (a, b, c)
    print(triple)

def f_4_elements(a,b,c,d):
    """Print the four values as a tuple."""
    values = (a, b, c, d)
    print(values)

def f_5_elements(a,b,c,d,e):
    """Print the first four values plus a "Dataset: <e>" label as a tuple."""
    dataset_label = f"Dataset: {e}"
    summary = (a, b, c, d, dataset_label)
    print(summary)


def handle_upload(b):
    """Button callback: toggle the list of audio files of the selected dataset.

    Each click flips the global `check_file_button` between 0 and 1. On 1 the
    button is relabelled "Hide files", every file of the selected dataset
    folder is loaded and peak-normalised, and a name plus audio player per
    file is rendered in `output_upload`. On 0 the output is cleared and the
    button is relabelled "Listen files".
    """
    global check_file_button
    output_upload.clear_output()
    check_file_button = (check_file_button + 1) % 2

    if check_file_button != 1:
        output_upload.clear_output()
        button_upload.description="Listen files"
        return

    button_upload.description="Hide files"
    dataset_dir = foldersData[folderNames.index(folders_available.value)]

    # Load and normalise every file of the dataset folder.
    clips = []
    for position, file_name in enumerate(os.listdir(dataset_dir), start=1):
        print(f'{position}: {file_name}')
        samples, _ = librosa.load(dataset_dir+"/"+file_name, sr=SAMPLE_RATE)
        clips.append((file_name, norm_audio(samples)))

    with output_upload:
        if not clips:
            print("No files selected...")
            return
        print("------------------------------------------")
        for position, (file_name, samples) in enumerate(clips, start=1):
            print(f"0{position}-{file_name}")
            display(Audio(samples, rate=SAMPLE_RATE))
       
def strat_train_on_click(b):
    """Button callback: fine-tune a model on the selected dataset folder.

    Loads and peak-normalises every file of the selected dataset, checks the
    inputs (1 to 5 audio files, instance word and object class of at least
    three characters each), copies the audio into a new
    `Training_Data_<n>` folder under the user folder, calls `train(...)`,
    then updates the model dropdowns, the user's data file and the training
    button. When the inputs are rejected, only an error message is shown in
    `output_training`. `b` is not used.
    """
    global data , train_status
    
    audios = []
    labels = []
    # Resolve the folder that belongs to the dataset name chosen in the dropdown.
    dataset_index = folderNames.index(folders_available.value)
    DATA_DIR0 = foldersData[dataset_index]

    # Load every file of the dataset at SAMPLE_RATE and normalise it.
    for i,file_name in enumerate(os.listdir(DATA_DIR0)):
        print(f'{i + 1}: {file_name}')
        audio_data, _ = librosa.load(DATA_DIR0+"/"+file_name, sr=SAMPLE_RATE)
        audio_data = norm_audio(audio_data)
        audios.append(audio_data)
        labels.append(file_name)
    
    # Input validation: 1..5 files, instance word and object class >= 3 characters.
    if (len(audios)>=1 and len(audios)<=5) and (len(instance.value)>=3) and (len(object_class.value)>=3):

        # Counts the trainings started in this kernel session.
        train_status = train_status + 1
        
        #Folder creation
        DATA_DIR = folder_creation(user_path,f"Training_Data_{data}")
        dataset_index = folderNames.index(folders_available.value)
        DATA_DIR0 = foldersData[dataset_index]
        data = data+1

        #Save Audios
        # labels[i] is the original file name, so ".wav" is appended after
        # whatever extension that name already carries.
        for i,a in enumerate(audios):
            w = norm_audio(a)
            scipy.io.wavfile.write(DATA_DIR + f"/{i+1}-{labels[i]}.wav", rate=SAMPLE_RATE, data=w)
        #Define model to train
        if model_name.value == "My Model":
            MODEL_NAME = model_path+"/trained_pipeline"
        else:
            MODEL_NAME = "cvssp/audioldm2"
        #Define output folder
        OUTPUT_DIR = model_path
        #Define train parameters
        INSTANCE_WORD = instance.value
        OBJECT_CLASS = object_class.value
        LEARNING_RATE = 4.0 * 10.0 ** (-5)
        # Number of training steps per "FINE-TUNING TIME" choice.
        if training_type.value=='Fast':
            TRAIN_STEPS = 100
        elif training_type.value == 'Medium':
            TRAIN_STEPS = 200
        elif training_type.value == 'Slow':
            TRAIN_STEPS = 500
        else:
            # Only reached when training_type holds a value other than the three above.
            TRAIN_STEPS = 5
            LEARNING_RATE = 2.0 * 10.0 ** (-2)
            
        with output_training:
            output_training.clear_output()
            print(f"START TRAINING")
            print(dataset_index)
            print(DATA_DIR0)
            
            # NOTE(review): the meaning of the first positional argument (True)
            # is defined by dreambooth_audioldm2_notebook.train — confirm there.
            train(True,MODELNAME=MODEL_NAME,DATAPATH=DATA_DIR,OUTPATH=OUTPUT_DIR,
              INSTANCEWORD=INSTANCE_WORD,OBJECTCLASS=OBJECT_CLASS,TRAINSTEPS=TRAIN_STEPS,
              NUMVECTORS=4,LEARNINGRATE=LEARNING_RATE,TRAINBATCHSIZE=1,GRADIENTACCUMULATION=2)
                
            print(f"----------------------------------------------------------------------------------------------")
            output_training.clear_output()
            print(f"CHECK...")
            # Offer "My Model" in both dropdowns once a trained pipeline folder exists.
            p = path_exist(user_path+"/Model/trained_pipeline")
            if (p != None) and ("My Model" not in model_name.options):
                model_name.options = list(model_name.options) + ["My Model"]
                model_available.options = list(model_available.options) + ["My Model"]
            button_train.disabled=False
            output_training.clear_output()

            # Log the training parameters in the user's data file.
            with open(user_path+f"/user_{user_id}_data.txt", 'a') as file:
                    file.write(f"Training_{INSTANCE_WORD}_{OBJECT_CLASS}_{model_name.value}_{training_type.value}_{len(audios)}-audios\n")
                
            print(f"END TRAINING : train status = {train_status}")
            # After the second training of the session the button is disabled
            # and the user is asked to restart the kernel.
            if train_status==2:
                print("For train again the model, please restart the kernel: Run -> Restart Kernel and Run All Cells... -> Restart ")
                button_train.disabled=True
            else:
                print("'My Model' is available")
                
    else:
        with output_training:
            output_training.clear_output()
            print(f"ERROR : input parameters not corrected = {train_status}")

def uploaded_files(change):
    """Observer for `upload_widget.value`: reset the buttons and list the uploads.

    Clears both output areas, re-enables and relabels the upload and train
    buttons, resets the global `check_file_button` to 0, then prints the
    number of uploaded files followed by one "<index>-<name>" line per file.
    `change` is not used.
    """
    global check_file_button
    output_upload.clear_output()
    output_training.clear_output()
    with output_upload:
        output_upload.clear_output()

        # Bring the buttons back to their initial state.
        button_upload.disabled = False
        button_upload.description="Show files"
        button_upload.layout={'border': '2px solid green'}
        check_file_button = 0
        button_train.disabled = False
        button_train.description="Start Fine-Tuning"

        uploads = upload_widget.value
        num_file = len(uploads)
        print(f"Number of files: {num_file}")
        if num_file <= 0:
            return
        print("--------------------------------------------------------------------------------")
        for position, item in enumerate(uploads, start=1):
            print(f"{position}-{item['name']}")
                  
#OBJECTS    

# File-upload widget; its value changes are observed by uploaded_files (see EVENTS).
upload_widget = widgets.FileUpload(accept='.wav, .mp3',multiple=True)

# Output areas for the dataset file list and for training messages.
output_upload = widgets.Output(layout={'border': '0px solid black'}) 
output_training = widgets.Output(layout={'border': '2px solid white'}) 

# Buttons; button_space is a disabled, unlabeled button used as a spacer in the row below.
button_upload = widgets.Button(description="Listen files",disabled=False)
button_space = widgets.Button(description='', disabled=True)
button_train = widgets.Button(description="Start",disabled=False,layout={'border': '2px solid lightblue'})

# Fine-tuning inputs: instance word, object class, model to fine-tune,
# training duration preset and dataset folder.
instance = widgets.Text(value='',placeholder='...',description='',disabled=False)
object_class = widgets.Text(value='',placeholder='...',description='',disabled=False)
model_name = Dropdown(options=models, description='')
training_type = Dropdown(options=['Fast','Medium','Slow'], description='')
folders_available = Dropdown(options=folderNames, description='')

# Live summary of the five inputs above, re-printed by f_5_elements whenever one changes.
train_prompt_setup_out = widgets.interactive_output(f_5_elements, {'a': instance, 'b': object_class, 'c': model_name, 
                                                                   'd': training_type, 'e' : folders_available})

# Rows shown in the fine-tuning section.
horizontal_box_prompt = widgets.HBox([instance, object_class, model_name,training_type])
horizontal_box_train_values = widgets.HBox([train_prompt_setup_out])
horizontal_box_upload = widgets.HBox([button_train,button_space,folders_available,button_upload])

# EVENTS   
# Connect buttons and the upload widget to their callbacks.
button_upload.on_click(handle_upload)
button_train.on_click(strat_train_on_click)
upload_widget.observe(uploaded_files, names='value')


# P.A.G.U.R.I. - User GUIDE
_______________________________________________________________________________________________________________________ 
- **GENERATE YOUR AUDIOS FROM TEXT**
  - Insert your text into *INPUT PROMPT* text-box.
  - Select the model generator from *MODELS AVAILABLE* list.
  - Select the number of waveform to generate using the *NUMBER OF WAVES PER PROMPT* slider.
  - Select the duration of audios using the *AUDIO DURATION IN SECONDS* slider.
  - Click on "GENERATE" button to generate music from your input prompt text.
  - Click on *CLEAR OUTPUT* button to clear the audio files generated window.
_______________________________________________________________________________________________________________________    
- **TRAIN YOUR MODEL WITH YOUR OWN AUDIOS**
  - Insert an Instance Word (at LEAST three characters) in the *INSTANCE WORD* text-box, so that the model can associate the text prompt with the output music.
  - Insert an Object Class (at LEAST three characters) in the *OBJECT CLASS* text-box, so that the model can associate the text prompt with the output music.
  - Select the model generator you want to fine-tune from the *MODELS AVAILABLE* list.
  - Select the kind of training procedure from *FINE-TUNING TIME* list (Fast,Medium,Slow).
  - Select the dataset of music to personalize by choosing one from those available in "SOUND DATASET AVAILABLE" list.
  - Click on *LISTEN FILES* button to see and listen to your audio samples from the music dataset selected.
  - Click on *HIDE FILES* button to hide the list of audio samples of the selected music dataset again.
  - Click on *START* button to start the fine-tuning of the model with your personal audio samples data.
_______________________________________________________________________________________________________________________ 
- **SUGGESTIONS FOR A CORRECT USE**
  - Perform one step at a time: don’t be hasty
  - Fine-tuning accepts only 3 ~ 5 files, no more and no less
  - Fine-tuning time : Fast : 5 - 7 min ; Medium : 10 - 13 min ; Slow : 18 - 20 min
  - Choose an uncommon word as INSTANCE WORD for the fine-tuning of the model.
  - To properly use the personalized model you created: select "My Model" from *MODELS AVAILABLE* list and insert in the *INPUT PROMPT* text-box
    a sentence that includes INSTANCE WORD and OBJECT CLASS used for fine-tuning the model

    Prompt example: *" a sound of sks electric guitar"* , where : INSTANCE WORD = "sks" and OBJECT CLASS = "electric guitar"
_______________________________________________________________________________________________________________________ 
- **CREDITS**
  
  This code is heavily based on : *Investigating Personalization Methods in Text to Music Generation*
  
  Paper reference : (https://arxiv.org/abs/2309.11140)
  
  Github reference : (https://github.com/zelaki/DreamSound)

  
     

In [5]:
# P.A.G.U.R.I DISPLAY
# Generation section: title label, column headings (plain text aligned with
# spaces above the widgets), the two widget rows and the output area.
description_generate = f"GENERATE YOUR MUSIC FROM TEXT"
generation_descriptor = widgets.Label(value=description_generate,layout={'border': '2px solid grey'})
display(generation_descriptor)
print("INPUT PROMPT                                                             GENERATE SOUND       CLEAR OUTPUT")
display(horizontal_text_box)
print("\nMODELS AVAILABLE                                       NUMBER OF AUDIO PER PROMPT                AUDIO DURATION IN SECONDS")
display(horizontal_generation_box)
display(output_generation)

# DISPLAY
# Fine-tuning section: title label, column headings, input row, live summary
# of the inputs, action row, then the file-list and training output areas.
description_train = f"CREATE A PERSONALIZED MODEL WITH YOUR AUDIO"
generation_train = widgets.Label(value=description_train,layout={'border': '2px solid lightblue'})
display(generation_train)
print("INSTANCE WORD                         OBJECT CLASS                           MODELS AVAILABLE                      FINE-TUNING TIME")
display(horizontal_box_prompt)
display(horizontal_box_train_values)
print("\nCREATE YOUR MODEL                         SOUND DATASET AVAIABLE                     LISTEN-HIDE AUDIO FILES")
display(horizontal_box_upload)
display(output_upload)
display(output_training)

Label(value='GENERATE YOUR MUSIC FROM TEXT', layout=Layout(border_bottom='2px solid grey', border_left='2px so…

INPUT PROMPT                                                             GENERATE SOUND       CLEAR OUTPUT


HBox(children=(Textarea(value='', layout=Layout(width='500px'), placeholder='...', rows=2), Button(description…


MODELS AVAILABLE                                       NUMBER OF AUDIO PER PROMPT                AUDIO DURATION IN SECONDS


HBox(children=(Dropdown(options=('AudioLDM2 Model', 'My Model'), value='AudioLDM2 Model'), IntSlider(value=3, …

Output(layout=Layout(border_bottom='2px solid white', border_left='2px solid white', border_right='2px solid w…

Label(value='CREATE A PERSONALIZED MODEL WITH YOUR AUDIO', layout=Layout(border_bottom='2px solid lightblue', …

INSTANCE WORD                         OBJECT CLASS                           MODELS AVAILABLE                      FINE-TUNING TIME


HBox(children=(Text(value='', placeholder='...'), Text(value='', placeholder='...'), Dropdown(options=('AudioL…

HBox(children=(Output(),))


CREATE YOUR MODEL                         SOUND DATASET AVAIABLE                     LISTEN-HIDE AUDIO FILES


HBox(children=(Button(description='Start', layout=Layout(border_bottom='2px solid lightblue', border_left='2px…

Output(layout=Layout(border_bottom='0px solid black', border_left='0px solid black', border_right='0px solid b…

Output(layout=Layout(border_bottom='2px solid white', border_left='2px solid white', border_right='2px solid w…