In [None]:
#FILE PREPROCESSING DATASET TEMPO
from zipfile import ZipFile
import xml.etree.cElementTree as et
from os import listdir,mkdir,system
from os.path import isfile, join, splitext,exists
from scipy.io import wavfile
from scipy import signal
import sys
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from madmom.audio.filters import MelFilterbank 
from pydub import AudioSegment
import os
import soundfile as sf

def unzip(path):
    """Extract one or more zip archives into the current working directory.

    Args:
        path: Either a single archive path (``str`` or ``os.PathLike``) or an
            iterable of archive paths.
    """
    # A bare string is itself iterable: without this guard it would be walked
    # character by character and every character opened as an archive.
    if isinstance(path, (str, os.PathLike)):
        path = [path]

    for archive in path:
        with ZipFile(archive, 'r') as zipp:
            print('Extracting all the files now...')
            zipp.extractall()
            print('Done!')
        
#Function that extracts the annotations from the xml file of the ExtendedBallroom dataset and standardizes
#them: as a result, the folder "Ann" is filled with txt files, each one containing the bpm of the related song
def ext_ann(path_ann, xml_path='extendedballroom_v1.1.xml'):
    """Write one bpm annotation file per song listed in the dataset XML.

    Every grandchild element of the XML root is treated as a song: its ``bpm``
    attribute is written to ``{path_ann}/{id}.txt``, where ``id`` is the
    element's ``id`` attribute.

    Args:
        path_ann: Directory receiving the ``.txt`` files; created if missing.
        xml_path: Path of the annotation XML file. Defaults to the file name
            that was previously hard-coded.

    Raises:
        KeyError: If a song element lacks the ``id`` or ``bpm`` attribute.
    """
    # Imported locally under its current name: the ``xml.etree.cElementTree``
    # alias no longer exists on Python 3.9+.
    import xml.etree.ElementTree as ElementTree

    os.makedirs(path_ann, exist_ok=True)
    root = ElementTree.parse(xml_path).getroot()
    for group in root:
        for song in group:
            # ``with`` guarantees the handle is closed even if the write fails.
            with open(f"{path_ann}/{song.attrib['id']}.txt", 'w') as out:
                out.write(str(song.attrib['bpm']))

#This function is used for the Giantsteps dataset, whose annotations are stored as .bpm files; these files are
#standardized to .txt files
def rename_file(path):
    """Give every regular file directly inside ``path`` a ``.txt`` extension.

    The part of each name before its last extension is kept; sub-directories
    are left untouched.
    """
    # Snapshot the plain files first, then rename them one by one.
    plain_files = []
    for name in listdir(path):
        if isfile(join(path, name)):
            plain_files.append(name)

    for name in plain_files:
        stem = splitext(name)[0]
        os.rename(f"{path}/{name}", f"{path}/{stem}.txt")
        
        
def convert_to_wav(path):
    """Convert every mp3 file directly inside ``path`` to wav, in place.

    For each ``name.mp3`` a ``name.wav`` is exported into the same directory
    and the mp3 is then deleted. Files with any other extension are ignored.

    Args:
        path: Directory containing the audio files.
    """
    # Only mp3 sources are picked up. Without this filter a second run would
    # treat ``x.wav`` as its own source, export it onto itself and then
    # delete it through the ``os.remove`` below.
    mp3_files = [
        f for f in listdir(path)
        if isfile(join(path, f)) and splitext(f)[1].lower() == ".mp3"
    ]

    for f in mp3_files:
        src = f"{path}/{f}"
        dst = f"{path}/{splitext(f)[0]}.wav"
        sound = AudioSegment.from_mp3(src)
        sound.export(dst, format="wav")
        # The mp3 is dropped only after the wav export has returned.
        os.remove(src)
        

# Giantsteps (GS) dataset: annotations, audio and spectrogram images.
path_giant_ann, path_giant_audio, path_giant_image = (
    "Dataset/Tempo/GS/Ann",
    "Dataset/Tempo/GS/Audio",
    "Dataset/Tempo/GS/Image_spec",
)

# Extended Ballroom (EXT) dataset: same three-folder layout.
path_ext_ann, path_ext_audio, path_ext_image = (
    "Dataset/Tempo/EXT/Ann",
    "Dataset/Tempo/EXT/Audio",
    "Dataset/Tempo/EXT/Image_spec",
)

In [None]:
# Write one bpm .txt annotation per Extended Ballroom song into path_ext_ann.
ext_ann(path_ext_ann)

In [None]:
# All the audio files in these datasets are mp3, but the wav format is preferable
# for the later steps: convert both audio folders in place (the source files are deleted).
convert_to_wav(path_ext_audio)
convert_to_wav(path_giant_audio)

In [None]:
# It is preferable to have everything standardized, including the extension of the
# annotation files: give every Giantsteps annotation file a .txt extension.
rename_file(path_giant_ann)

In [None]:
def augmentation_data(path_audio, path_ann, stretch=0.2, segment_seconds=15):
    """Augment a dataset in place: tempo-shifted copies, then fixed-length cuts.

    Stage 1: for every audio file in ``path_audio`` two time-stretched copies
    are written next to it, ``f_{name}`` (faster, rate ``1 + stretch``) and
    ``s_{name}`` (slower, rate ``1 - stretch``). The matching annotation files
    ``f_{stem}.txt`` and ``s_{stem}.txt`` receive the original bpm multiplied
    by the same rate, so the label follows the audio.

    Stage 2: every audio file longer than ``segment_seconds`` (including the
    copies made in stage 1) is replaced by consecutive, non-overlapping
    segments ``{stem}_1.wav``, ``{stem}_2.wav``, ... of exactly that length.
    A trailing remainder shorter than one segment is dropped. The annotation
    file is renamed to ``{stem}_1.txt`` and its first line is copied for the
    other segments. Shorter files are left untouched.

    Args:
        path_audio: Directory with the audio files; modified in place.
        path_ann: Directory with one ``{stem}.txt`` bpm file per audio file.
        stretch: Relative speed change of the two copies (default 0.2, i.e.
            +20% and -20%). Must lie strictly between 0 and 1.
        segment_seconds: Segment length in seconds (default 15).

    Raises:
        ValueError: If ``stretch`` or ``segment_seconds`` is out of range.

    Note:
        Every file found in ``path_audio`` is processed, so calling this twice
        on the same folders also augments the previously generated files.
    """
    if not 0 < stretch < 1:
        raise ValueError("stretch must be strictly between 0 and 1")
    if segment_seconds <= 0:
        raise ValueError("segment_seconds must be positive")

    def _first_line(stem):
        # The bpm is stored on the first line of the annotation file.
        with open(f"{path_ann}/{stem}.txt", "r") as ann:
            return ann.readlines()[0]

    # ---- Stage 1: faster / slower copies with correspondingly scaled bpm ----
    variants = (("f", 1 + stretch), ("s", 1 - stretch))
    onlyfiles = [f for f in listdir(path_audio) if isfile(join(path_audio, f))]
    for f in onlyfiles:
        stem = splitext(f)[0]
        bpm = float(_first_line(stem))
        y, sr = librosa.load(f"{path_audio}/{f}")
        for prefix, rate in variants:
            # ``rate`` is keyword-only in recent librosa releases; the keyword
            # form is accepted by older releases as well.
            stretched = librosa.effects.time_stretch(y, rate=rate)
            sf.write(f"{path_audio}/{prefix}_{f}", stretched, sr, 'PCM_24')
            # Playing the audio ``rate`` times faster multiplies its tempo by
            # ``rate``, so the annotation is scaled by the same factor.
            with open(f"{path_ann}/{prefix}_{stem}.txt", "w") as ann:
                ann.write(str(bpm * rate))

    # ---- Stage 2: cut long files into fixed-length segments ----
    # The folder is listed again so that the stage-1 copies are cut as well.
    onlyfiles = [f for f in listdir(path_audio) if isfile(join(path_audio, f))]
    for f in onlyfiles:
        y, sr = librosa.load(f"{path_audio}/{f}")
        seg_len = int(sr * segment_seconds)
        if len(y) <= seg_len:
            continue

        stem = splitext(f)[0]
        bpm_line = _first_line(stem)
        for count in range(1, len(y) // seg_len + 1):
            y_new = y[seg_len * (count - 1):seg_len * count]
            sf.write(f"{path_audio}/{stem}_{count}.wav", y_new, sr, 'PCM_24')
            if count == 1:
                # The original annotation file becomes the first segment's.
                os.rename(f"{path_ann}/{stem}.txt", f"{path_ann}/{stem}_{count}.txt")
            else:
                with open(f"{path_ann}/{stem}_{count}.txt", "w") as ann:
                    ann.write(str(bpm_line))

        # The uncut source is no longer needed once its segments are written.
        os.remove(f"{path_audio}/{f}")
            
    
# Augment both datasets in place (audio folder plus matching annotation folder).
# NOTE(review): augmentation_data processes every file it finds in the audio folder,
# so re-running this cell would also process the files generated by a previous run.
augmentation_data(path_giant_audio,path_giant_ann)
augmentation_data(path_ext_audio,path_ext_ann)

In [None]:
#this is the preprocessing, the computation of log_mel_spectrogram
def create_image_spec(path_audio, path_img):
    """Render a mel-spectrogram image for every audio file in ``path_audio``.

    For each file ``{stem}.ext`` a 40-band mel spectrogram (n_fft 1024, hop
    512, 20-5000 Hz, audio loaded at 22050 Hz) is converted to dB and saved
    as ``{path_img}/{stem}.jpeg``. Files whose image already exists are
    skipped, so an interrupted run can be resumed.

    Args:
        path_audio: Directory containing the audio files.
        path_img: Directory receiving the images; created if missing.
    """
    os.makedirs(path_img, exist_ok=True)

    onlyfiles = [f for f in listdir(path_audio) if isfile(join(path_audio, f))]
    for f in onlyfiles:
        target = f"{path_img}/{splitext(f)[0]}.jpeg"
        if exists(target):
            continue

        y, sr = librosa.load(f"{path_audio}/{f}", sr=22050)
        ps = librosa.feature.melspectrogram(
            y=y, n_fft=1024, hop_length=1024 // 2, sr=sr,
            power=1, n_mels=40, fmin=20, fmax=5000,
        )
        # NOTE(review): the spectrogram is computed with power=1 but converted
        # with power_to_db; amplitude_to_db may have been intended. Left as is
        # so that newly rendered images match previously rendered ones.
        mels = librosa.power_to_db(ps, ref=np.max)

        fig, ax = plt.subplots()
        try:
            # NOTE(review): sr=11025 differs from the 22050 Hz used above;
            # kept unchanged - confirm whether it is intentional.
            librosa.display.specshow(mels, sr=11025, ax=ax)
            fig.savefig(target)
        finally:
            # One figure is created per file: close it so that figures do not
            # accumulate in memory over the loop.
            plt.close(fig)
            
# Render the spectrogram images of both datasets; audio files whose .jpeg
# already exists in the image folder are skipped.
create_image_spec(path_giant_audio,path_giant_image)
create_image_spec(path_ext_audio,path_ext_image)