## Pipeline for pré-processing FMA Large dataset- Downloaded in: https://github.com/mdeff/fma


In [41]:
import os
import csv
import json
import pandas as pd
import numpy as np
import tensorflow as tf
from math import ceil
from sklearn.utils import shuffle

from joblib import Parallel, delayed

import multiprocessing
from tqdm import tqdm

from essentia.standard import MonoLoader, TensorflowPredictEffnetDiscogs
from sklearn.model_selection import train_test_split


In [42]:
from tqdm.notebook import tqdm

In [43]:
tqdm.pandas()

In [44]:

args = pd.Series({
    "root_dir":"/mnt/disks/data/",
    "dataset_path":"/mnt/disks/data/fma/fma_large",
    "embeddings":"music_style",
    "train_id": "hierarchical_mini_test",
    'sample_size':0.01
})


In [45]:



job_path = "/mnt/disks/data/fma/trains"


# In[15]:


train_path = os.path.join(job_path,args.train_id)


# In[16]:


base_path = os.path.join(args.root_dir,"fma")


# In[17]:


models_path = os.path.join(args.root_dir,"models")


metadata_path_fma = os.path.join(base_path,"fma_metadata")


# In[18]:


metadata_file = os.path.join(train_path,"metadata.json")


labels_file = os.path.join(train_path,"labels.json")


In [46]:

def create_dir(path):
    # checking if the directory demo_folder2 
    # exist or not.
    if not os.path.isdir(path):

        # if the demo_folder2 directory is 
        # not present then create it.
        os.makedirs(path)
    return True



In [47]:
create_dir(train_path)

True

In [34]:

if args.embeddings == "music_style":
    model_path = os.path.join(models_path,args.embeddings,"discogs-effnet-bs64-1.pb")



In [35]:


def __load_json__(path):
    with open(path, 'r') as f:
        tmp = json.loads(f.read())

    return tmp




In [36]:
import pandas as pd
import os

In [138]:
df = df[['track_id','file_path','labels']]

In [61]:
df.columns

Index(['track_id', 'file_path', 'labels'], dtype='object')

In [62]:
### Exemplo de extração de features
audio = MonoLoader(filename=df.iloc[1].file_path, sampleRate=16000)()
model = TensorflowPredictEffnetDiscogs(graphFilename=model_path,output="PartitionedCall:1")
activations = model(audio)

2023-03-20 11:24:30.315634: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:65:00.0 name: NVIDIA GeForce RTX 2080 Ti computeCapability: 7.5
coreClock: 1.65GHz coreCount: 68 deviceMemorySize: 10.76GiB deviceMemoryBandwidth: 573.69GiB/s
2023-03-20 11:24:30.315913: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0
2023-03-20 11:24:30.315935: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:
2023-03-20 11:24:30.315939: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264]      0 
2023-03-20 11:24:30.315943: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0:   N 
2023-03-20 11:24:30.316214: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 9493 MB memory) -> physical GPU (device: 0, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:6

In [63]:
activations = model(audio)

2023-03-20 11:24:55.705037: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:65:00.0 name: NVIDIA GeForce RTX 2080 Ti computeCapability: 7.5
coreClock: 1.65GHz coreCount: 68 deviceMemorySize: 10.76GiB deviceMemoryBandwidth: 573.69GiB/s
2023-03-20 11:24:55.705794: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0
2023-03-20 11:24:55.705822: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:
2023-03-20 11:24:55.705828: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264]      0 
2023-03-20 11:24:55.705832: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0:   N 
2023-03-20 11:24:55.706154: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 9493 MB memory) -> physical GPU (device: 0, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:6

In [64]:
activations.shape

(29, 1280)

In [66]:
groups = df.groupby("labels")


In [72]:

def __split_data__(group, percentage=0.1):
    if len(group) == 1:
        return group, group

    shuffled = shuffle(group.values)
    finish_test = int(ceil(len(group) * percentage))

    first = pd.DataFrame(shuffled[:finish_test], columns=group.columns)
    second = pd.DataFrame(shuffled[finish_test:], columns=group.columns)

    return first, second


# In[32]:


def __split_data_sample(groups):
    dataset_trainset_path = os.path.join(train_path,"trainset.csv")
    dataset_testset_path = os.path.join(train_path,"testset.csv")
    dataset_validationset_path = os.path.join(train_path,"validationset.csv")
    
    
    X_train,y_train,X_test,y_test,X_val,y_val = (list(),list(),list(),list(),list(),list())
    for code, group in groups:
        
        test, train_to_split  = __split_data__(group, 0.05) # 10%
        validation, train = __split_data__(train_to_split, 0.05) # %10
        #rint(test)
        
        X_train.append(train)
        X_test.append(test)
        X_val.append(validation)
        
    X_train = pd.concat(X_train, sort=False).sample(frac=1).reset_index(drop=True)
    X_train.to_csv(dataset_trainset_path, index=False, quoting=csv.QUOTE_ALL)
    print(dataset_trainset_path)
    
    X_test = pd.concat(X_test, sort=False).sample(frac=1).reset_index(drop=True)
    X_test.to_csv(dataset_testset_path, index=False, quoting=csv.QUOTE_ALL)
    print(dataset_testset_path)

    X_val = pd.concat(X_val, sort=False).sample(frac=1).reset_index(drop=True)
    X_val.to_csv(dataset_validationset_path, index=False, quoting=csv.QUOTE_ALL)
    print(dataset_validationset_path)
    
    return X_train,X_test,X_val



In [73]:


X_train,X_test,X_validation = __split_data_sample(groups)


TypeError: unhashable type: 'list'

In [30]:

def extract_feature(file_path,model):
    ### Configuração do model para extrair a representação do aúdio
    # model = TensorflowPredictEffnetDiscogs(graphFilename=model_path)
    audio = MonoLoader(filename=file_path, sampleRate=16000)()
    activations = model(audio)
    return activations


# In[44]:


def _bytes_feature(value):
    """Returns a bytes_list from a string / byte."""
    if isinstance(value, type(tf.constant(0))): # if value ist tensor
        value = value.numpy() # get value of tensor
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
  """Returns a floast_list from a float / double."""
  return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def _int64_feature(value):
  """Returns an int64_list from a bool / enum / int / uint."""
  return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def serialize_array(array):
  array = tf.io.serialize_tensor(array)
  return array


def parse_labels(example):
    label_list = example['labels'].values
    labels = tf.fill([16], -1)  # preencher com -1 no caso de haver menos de 16 níveis
    for i, label in enumerate(label_list):
        if i == 16:
            break
        # Aqui você pode fazer a conversão do label em um índice inteiro usando um dicionário ou outro método
        # Neste exemplo, estou apenas usando a posição da label na lista como índice
        labels[i] = i
    return labels


def parse_single_music(data,music,labels):
    cat1 = data
    
    label1 = np.array(cat1, np.int64)
    
    
    
    #define the dictionary -- the structure -- of our single example
    data = {
        'emb' : _bytes_feature(serialize_array(music)),
        'label' : _int64_feature(label1)
    }
    #create an Example, wrapping the single features
    out = tf.train.Example(features=tf.train.Features(feature=data))

    return out
# In[45]:




In [31]:

def generate_tf_records(df,labels,model,filename="train"):
    
    tfrecords_path = os.path.join(train_path,"tfrecords",filename)
    
    create_dir(tfrecords_path)
    
    
    batch_size = 1024 * 10  # 10k records from each file batch
    count = 0
    total = ceil(len(df) / batch_size)
    
    for i in range(0, len(df), batch_size):
        batch_df = df[i:i+batch_size]
        
        tqdm.pandas()
        
        X = batch_df.file_path.progress_apply(lambda x: extract_feature(x,model))   
        
        print("Extraiu as features")
        
        batch_df = batch_df[['first_genre_id_label']]
        
        
        
        tfrecords = [parse_single_music(data, x,labels) for data, x in zip(batch_df.values, X)]
        
        path = os.path.join(tfrecords_path,f"{str(count).zfill(10)}.tfrecord")

        #with tf.python_io.TFRecordWriter(path) as writer:
        with tf.io.TFRecordWriter(path) as writer:
            for tfrecord in tfrecords:
                writer.write(tfrecord.SerializeToString())

        print(f"{count} {len(tfrecords)} {path}")
        count += 1
        print(f"{count}/{total} batchs / {count * batch_size} processed")

    print(f"{count}/{total} batchs / {len(df)} processed")



In [25]:
df.file_path.iloc[2]

'/mnt/disks/data/fma/fma_large/016/016598.mp3'

In [26]:
model_path

'/mnt/disks/data/models/music_style/discogs-effnet-bs64-1.pb'

In [27]:
model

<essentia.standard._create_essentia_class.<locals>.Algo at 0x7f43481f6e00>

In [28]:

extract_feature(df.file_path.iloc[2],model)



KeyboardInterrupt



In [None]:

dataset_names = ["train","test","validation"]

datasets = [X_train,X_test,X_validation]

In [29]:
X_train

Unnamed: 0,track_id,comments,date_created,date_released,engineer,favorites,id,information,listens,producer,...,lyricist,number,publisher,tags.2,title.1,first_genre_id,first_genre_id_label,full_genre_id,file_path,valid
0,108620,0.0,2014-08-29 11:01:51,2014-08-29 00:00:00,,0.0,16882.0,"<p>Dear friends, here we are with our 27th edi...",33851.0,,...,,3.0,,[],Nr.13b,1,1,1-38-0,/mnt/disks/data/fma/fma_large/108/108620.mp3,True
1,68335,0.0,2012-07-25 14:44:59,2012-07-16 00:00:00,Irene Trudel,0.0,12205.0,<p>After two endearing albums of sweet-voiced ...,6547.0,Irene Trudel,...,,1.0,,[],The Lake,10,10,10-0,/mnt/disks/data/fma/fma_large/068/068335.mp3,True
2,77749,0.0,2013-02-08 11:52:20,2013-02-08 00:00:00,,3.0,13465.0,"<p><strong>Want high-resolution album art, lin...",32616.0,,...,,56.0,,[],Gladys (Single Mix),1,1,1-38-0,/mnt/disks/data/fma/fma_large/077/077749.mp3,True
3,122560,0.0,2015-07-14 17:01:12,2015-07-14 00:00:00,,0.0,18550.0,"<p><strong style=""color: #5a5954; font-family:...",5536.0,,...,,2.0,,['experimental'],Conduct,15,15,15-0,/mnt/disks/data/fma/fma_large/122/122560.mp3,True
4,26811,0.0,2010-03-08 20:09:48,2007-10-01 00:00:00,,0.0,5819.0,<p>Second EP by French trio. Released on vinyl...,2561.0,,...,,4.0,,[],Yes Madame,1,1,1-38-0,/mnt/disks/data/fma/fma_large/026/026811.mp3,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
897,16573,0.0,2009-07-13 13:01:30,2006-09-12 00:00:00,Gil Shuster,1.0,3798.0,<p>TLASILA stopped by the WFMU studios for a s...,615.0,Brian Turner,...,,2.0,,[],Authorized,1,1,1-38-0,/mnt/disks/data/fma/fma_large/016/016573.mp3,True
898,52000,0.0,2011-08-03 09:35:36,2011-07-25 00:00:00,,0.0,9773.0,"<p><span style=""font-family:Verdana, Arial, He...",4557.0,,...,,10.0,,[],Irie dub feat. Leah,79,47,79-2-0,/mnt/disks/data/fma/fma_large/052/052000.mp3,True
899,92506,0.0,2013-10-24 12:29:55,2013-10-05 00:00:00,Stu Rutherford,0.0,15304.0,<p>Spray Paint stops by WFMU's Cherry Blossom ...,7133.0,Terre T,...,,5.0,,[],Yawn Factory,12,12,12-0,/mnt/disks/data/fma/fma_large/092/092506.mp3,True
900,15886,0.0,2009-07-02 11:13:34,2009-07-02 00:00:00,Trent Wolbe,0.0,3631.0,"<p><strong><span style=""font-family: Verdana, ...",2702.0,Trouble,...,,5.0,,[],Lazy Lover,33,29,33-17-0,/mnt/disks/data/fma/fma_large/015/015886.mp3,True


In [None]:
with Parallel(n_jobs=3, require='sharedmem') as para:
    print("Estamos usando paralelismo!!!")
    para(delayed(generate_tf_records)(dataset,labels,model,dataset_name) for (dataset_name,dataset) in zip(dataset_names,datasets))



In [None]:

metadata = {
    "train_count":len(X_train),
    "test_count":len(X_test),
    "val_count":len(X_validation),
    "global_size": len(labels['global']),
    "root_dir":args.root_dir,
    "embeddings":args.embeddings,
    "train_id": args.train_id
}


# In[ ]:




# In[ ]:


with open(metadata_file, 'w+') as f:
    f.write(json.dumps(metadata))


# In[ ]:
