In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import json
import ast
import os
import csv
import math
from sklearn.utils import shuffle
from math import ceil
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

2023-11-26 11:30:41.409878: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
tqdm.pandas()

In [15]:

# Run configuration shared by the rest of the notebook.
args = pd.Series({
    # Data root; the "fma" and "models" folders are resolved under it.
    "root_dir":"/mnt/disks/data/",
    # Folder whose "tfrecords/<embeddings>" sub-folder is read by load_dataset.
    "dataset_path":"/mnt/disks/data/fma/fma_large",
    # Name of the embedding set (sub-folder name under "tfrecords").
    "embeddings":"music_style",
    # Stored as 'sequence_size' in the metadata JSON written by create_metadata.
    "sequence_size": 1280,
    # Name of the job folder that receives labels, metadata and tfrecords.
    "train_id": "hierarchical_multilabel_sample",
    # Fraction of tracks kept by df_tracks.sample(frac=...).
    'sample_size': 0.1
})


In [16]:

# Every location is derived from args.root_dir, so changing the data root in
# the configuration cell moves all of them together (previously base_path was
# first hard-coded and then reassigned a few lines later).
base_path = os.path.join(args.root_dir, "fma")

# Working directory of this training job: <root>/fma/trains/<train_id>.
job_path = os.path.join(base_path, "trains", args.train_id)

# TFRecords produced by this notebook (train / val / test sub-folders).
tfrecord_path = os.path.join(job_path, "tfrecords")

models_path = os.path.join(args.root_dir, "models")

# Folder holding the FMA metadata CSV files (tracks_genres.csv, genres.csv).
metadata_path_fma = os.path.join(base_path, "fma_metadata")

# JSON outputs written into the job directory.
metadata_path = os.path.join(job_path, "metadata.json")

categories_labels_path = os.path.join(job_path, "labels.json")


In [17]:


def __load_json__(path):
    with open(path, 'r') as f:
        tmp = json.loads(f.read())

    return tmp




In [18]:

def create_dir(path):
    """Make sure the directory ``path`` exists, creating parents as needed.

    Returns ``True`` in every successful case (directory created or already
    present). Raises ``FileExistsError`` if ``path`` exists but is not a
    directory.
    """
    # exist_ok=True replaces the separate isdir() check, which could race
    # with another process creating the same directory.
    os.makedirs(path, exist_ok=True)
    return True



In [21]:
import shutil

# Start from a clean job directory. On a fresh machine there is nothing to
# remove yet, so only delete when the directory actually exists; otherwise
# rmtree raises FileNotFoundError and the notebook cannot run top to bottom.
if os.path.isdir(job_path):
    shutil.rmtree(job_path)

In [22]:
create_dir(job_path)

True

## Análise do tracks.csv

In [23]:
tracks = os.path.join(metadata_path_fma,"tracks_genres.csv")

In [24]:
df_tracks = pd.read_csv(tracks)

In [25]:
df_tracks.full_genre_id.iloc[0]

'[[21]]'

In [26]:
def extract_literal_val(labels):
    """Evaluate the Python literal in the string ``labels`` and return its items as a list."""
    parsed = ast.literal_eval(labels)
    return list(parsed)

In [27]:
df_tracks["full_genre_id"] = df_tracks.full_genre_id.apply(lambda x : extract_literal_val(x))

In [28]:
df_tracks = df_tracks.sample(frac=args.sample_size)

In [85]:
df_tracks.full_genre_id.value_counts()

full_genre_id
[[21]]                                            296
[[15]]                                            268
[[1, 38], [30, 38], [38], [41, 38], [247, 38]]    173
[[12]]                                            158
[[38]]                                            135
                                                 ... 
[[7, 20], [15], [181, 15]]                          1
[[10], [12], [103, 17]]                             1
[[14], [15], [38]]                                  1
[[58, 12], [76, 10], [311, 13]]                     1
[[311, 13], [362, 10]]                              1
Name: count, Length: 2530, dtype: int64

In [86]:
# Remover linhas com genre_id vazio
df_tracks = df_tracks[df_tracks['full_genre_id'].map(len) > 0]


In [87]:
df_tracks.iloc[150]

track_id                                                       96993
full_genre_id                      [[15, 183], [42, 15], [38], [15]]
file_path               /mnt/disks/data/fma/fma_large/096/096993.mp3
labels_genre_id    [[183, 15, 0, 0, 0], [15, 42, 0, 0, 0], [38, 0...
Name: 63225, dtype: object

In [88]:
def get_label_size(labels):
    """Return the length of the longest entry in ``labels``.

    Raises ``ValueError`` when ``labels`` is empty.
    """
    lengths = list(map(len, labels))
    return max(lengths)

In [89]:
labels_size = df_tracks.full_genre_id.apply(lambda x: get_label_size(x))

In [90]:
labels_size

86488    2
82099    2
50000    3
90646    2
72844    2
        ..
42354    2
71509    2
43957    2
93353    2
87671    1
Name: full_genre_id, Length: 10419, dtype: int64

In [91]:
labels_size.unique()

array([2, 3, 1, 4, 5])

In [92]:
labels_size = max(labels_size)

In [93]:
labels_size

5

### Parsing labels into the hierarchical structure

In [62]:
### Function for parsing a label into the structure of the hierarchical scheme

def parse_label(label,label_size):
    """Convert one genre path into a fixed-length, zero-padded integer array.

    The ids of ``label`` are written in reversed order (last id first) into
    an array of ``label_size`` integers. Unused positions stay 0, and ids
    that do not fit in ``label_size`` positions are dropped.

    Args:
        label: sequence of integer genre ids.
        label_size: length of the returned array.

    Returns:
        ``numpy.ndarray`` of dtype int with shape ``(label_size,)``.
    """
    padded = np.zeros(label_size, dtype=int)
    # Bound the copy by label_size itself (previously a hard-coded 5), so a
    # smaller size cannot overflow the array and a larger one is not cut short.
    for position, genre_id in enumerate(label[::-1][:label_size]):
        padded[position] = genre_id
    return padded


def parse_labels(labels,label_size=5):
    """Apply ``parse_label`` to every entry of ``labels`` and return the results as a list."""
    return [parse_label(single_label, label_size) for single_label in labels]
    

In [63]:
df_tracks.iloc[150]

track_id_                                               96993
full_genre_id               [[15, 183], [42, 15], [38], [15]]
file_path        /mnt/disks/data/fma/fma_large/096/096993.mp3
Name: 63225, dtype: object

In [64]:
df_tracks.full_genre_id.iloc[150][0][::-1]

[183, 15]

In [94]:
parse_labels(df_tracks.full_genre_id.iloc[150])

[array([183,  15,   0,   0,   0]),
 array([15, 42,  0,  0,  0]),
 array([38,  0,  0,  0,  0]),
 array([15,  0,  0,  0,  0])]

In [95]:
parsed_labels = df_tracks.full_genre_id.apply(lambda x: parse_labels(x))

In [96]:
parsed_labels

86488                                 [[21, 100, 0, 0, 0]]
82099             [[15, 286, 0, 0, 0], [21, 542, 0, 0, 0]]
50000             [[12, 25, 0, 0, 0], [12, 25, 109, 0, 0]]
90646    [[21, 0, 0, 0, 0], [2, 46, 0, 0, 0], [21, 539,...
72844    [[17, 0, 0, 0, 0], [17, 94, 0, 0, 0], [9, 137,...
                               ...                        
42354    [[12, 0, 0, 0, 0], [12, 25, 0, 0, 0], [12, 27,...
71509    [[1235, 18, 0, 0, 0], [15, 42, 0, 0, 0], [38, ...
43957                [[15, 0, 0, 0, 0], [10, 76, 0, 0, 0]]
93353                [[38, 0, 0, 0, 0], [38, 47, 0, 0, 0]]
87671                 [[15, 0, 0, 0, 0], [38, 0, 0, 0, 0]]
Name: full_genre_id, Length: 10419, dtype: object

In [97]:
df_tracks['labels_genre_id'] = parsed_labels

In [98]:
df_tracks.rename(columns={'track_id_':'track_id'}, inplace=True)

In [99]:
def get_unique_labels(all_labels):
    """Return the unique labels found in the multi-label entries of ``all_labels``.

    NOTE(review): this definition is replaced by the later
    ``get_unique_labels(all_labels, level=4)`` defined further down in this
    file, so after both cells run this version is no longer reachable.
    """
    df = pd.DataFrame({'labels': []})
    cv_labels = []
    for labels in all_labels:
        # Entries holding a single label (or none) are skipped entirely.
        if len(labels) > 1:
            # labels[0] is appended here and again by the loop below; the
            # duplicate is collapsed by unique() at the end.
            cv_labels.append(labels[0])
            for label in labels:
                cv_labels.append(label)
    df['labels'] = cv_labels
    return df.labels.unique()

In [116]:
def get_unique_labels(all_labels,level=4):
    """List the distinct non-zero ids found at position ``level`` of every label.

    ``all_labels`` is an iterable of label collections. Each label is indexed
    with ``level``, and values equal to 0 are ignored. The result keeps
    first-appearance order.
    """
    non_zero_ids = [
        label[level]
        for labels in all_labels
        for label in labels
        if label[level] != 0
    ]
    frame = pd.DataFrame({'labels': non_zero_ids})
    return frame.labels.unique().tolist()


In [117]:
get_unique_labels(df_tracks.labels_genre_id.values,level=0)

[21, 15, 12, 2, 17, 9, 38, 4, 10, 5, 1235, 13, 3, 20, 14, 8, 183]

In [118]:
level_4 = {'level4': get_unique_labels(df_tracks.labels_genre_id.values)}
level_3 = {'level3': get_unique_labels(df_tracks.labels_genre_id.values,level=3)}
level_2 = {'level2': get_unique_labels(df_tracks.labels_genre_id.values,level=2)}
level_1 = {'level1': get_unique_labels(df_tracks.labels_genre_id.values,level=1)}
level_0 = {'level0': get_unique_labels(df_tracks.labels_genre_id.values,level=0)}

In [119]:
get_unique_labels(df_tracks.labels_genre_id.values,level=0)

[21, 15, 12, 2, 17, 9, 38, 4, 10, 5, 1235, 13, 3, 20, 14, 8, 183]

In [120]:
genres_df = pd.read_csv(os.path.join(metadata_path_fma,'genres.csv'))


In [121]:
genres_df.top_level.unique()

array([  38,    2,    3,    4,    5,   20,    8,    9,   10,   14,   12,
         13,   15,   17, 1235,   21])

In [124]:
def get_labels_name(labels, genres_df):
    """Join the titles of the given genre ids into one ``'>'``-separated string.

    Each id in ``labels`` is looked up in ``genres_df`` (columns ``genre_id``
    and ``title``), and the first matching title is used. An id with no match
    reuses the most recently resolved title, or ``""`` if none has been
    resolved yet.
    """
    names = []
    last_known = ""
    for genre in labels:
        titles = genres_df.loc[genres_df['genre_id'] == int(genre), 'title']
        if not titles.empty:
            last_known = titles.values.tolist()[0]
        names.append(last_known)
    return '>'.join(names)

In [125]:
level4_name = get_labels_name(level_4['level4'], genres_df)

In [128]:
level3_name = get_labels_name(level_3['level3'], genres_df)
level2_name = get_labels_name(level_2['level2'], genres_df)
level1_name = get_labels_name(level_1['level1'], genres_df)
level0_name = get_labels_name(level_0['level0'], genres_df)

In [129]:
level0_name

'Hip-Hop>Electronic>Rock>International>Folk>Country>Experimental>Jazz>Pop>Classical>Instrumental>Easy Listening>Blues>Spoken>Soul-RnB>Old-Time / Historic>Glitch'

In [None]:
def __create_labels__(categories_df):
    data = {
        "label1": {},
        "label2": {},
        "label3": {},
        "label4": {},
        "label5": {},
        "label1_inverse": [],
        "label2_inverse": [],
        "label3_inverse": [],
        "label4_inverse": [],
        "label5_inverse": [],
        "label1_name": {},
        "label2_name": {},
        "label3_name": {},
        "label4_name": {},
        "label5_name": {},
    }

    idx = 0
    for id_x, cat in enumerate(set(categories_df.level1.values.tolist())):
        data['label1'][cat] = idx
        data['label1_inverse'].append(cat)
        data['label1_count'] = idx + 1
        idx += 1

    for id_x, cat in enumerate(set(categories_df.level2.values.tolist())):
        data['label2'][cat] = idx
        data['label2_inverse'].append(cat)
        data['label2_count'] = idx + 1
        idx += 1
    for id_x, cat in enumerate(set(categories_df.level3.values.tolist())):
        data['label3'][cat] = idx
        data['label3_inverse'].append(cat)
        data['label3_count'] = idx + 1
        idx += 1

    for id_x, cat in enumerate(set(categories_df.level4.values.tolist())):
        data['label4'][cat] = idx
        data['label4_inverse'].append(cat)
        data['label4_count'] = idx + 1
        idx += 1
    for idx, cat in enumerate(set(categories_df.level5.values.tolist())):
        data['label5'][cat] = idx
        data['label5_inverse'].append(cat)
        data['label5_count'] = idx + 1
        idx += 1
    for cat5, cat1, cat2, cat3, cat4, name5 in categories_df.values:
        name1 = '>'.join(name5.split('>')[:1])
        name2 = '>'.join(name5.split('>')[:2])
        name3 = '>'.join(name5.split('>')[:3])
        name4 = '>'.join(name5.split('>')[:4])
        
        data['label1_name'][cat1] = name1
        data['label2_name'][cat2] = name2
        data['label3_name'][cat3] = name3
        data['label4_name'][cat4] = name4
        data['label5_name'][cat5] = name5
    return data

In [None]:
with open(categories_labels_path, 'w+') as f:
    f.write(json.dumps(__create_labels__(categories_df)))

In [113]:
labels =__create_labels__(categories_df)

In [None]:
labels['label1']

{'337': 0,
 '92': 1,
 '85': 2,
 '442': 3,
 '619': 4,
 '188': 5,
 '13': 6,
 '651': 7,
 '113': 8,
 '502': 9,
 '42': 10,
 '2': 11,
 '810': 12,
 '176': 13,
 '296': 14,
 '32': 15,
 '167': 16,
 '89': 17,
 '174': 18,
 '240': 19,
 '27': 20,
 '88': 21,
 '539': 22,
 '404': 23,
 '180': 24,
 '1032': 25,
 '4': 26,
 '314': 27,
 '41': 28,
 '117': 29,
 '12': 30,
 '428': 31,
 '360': 32,
 '138': 33,
 '22': 34,
 '20': 35,
 '602': 36,
 '74': 37,
 '695': 38,
 '401': 39,
 '179': 40,
 '107': 41,
 '36': 42,
 '9': 43,
 '8': 44,
 '100': 45,
 '185': 46,
 '468': 47,
 '493': 48,
 '181': 49,
 '441': 50,
 '362': 51,
 '90': 52,
 '187': 53,
 '101': 54,
 '49': 55,
 '30': 56,
 '81': 57,
 '182': 58,
 '63': 59,
 '1235': 60,
 '189': 61,
 '111': 62,
 '125': 63,
 '186': 64,
 '58': 65,
 '38': 66,
 '297': 67,
 '808': 68,
 '456': 69,
 '76': 70,
 '542': 71,
 '19': 72,
 '444': 73,
 '71': 74,
 '567': 75,
 '359': 76,
 '247': 77,
 '46': 78,
 '538': 79,
 '169': 80,
 '440': 81,
 '514': 82,
 '1': 83,
 '77': 84,
 '3': 85,
 '16': 86,
 '1

In [None]:
labels['label1_count']

160

In [116]:
def parse_tfr_element(element):
    """Decode one serialized example into a ``(feature, track_id)`` pair.

    The record is expected to hold a scalar string ``'emb'`` (a serialized
    tensor) and a scalar int64 ``'track_id'``.
    """
    # Schema of the stored record.
    schema = {
        'emb' : tf.io.FixedLenFeature([], tf.string),
        'track_id' : tf.io.FixedLenFeature([], tf.int64),
    }
    parsed = tf.io.parse_single_example(element, schema)

    # 'emb' is a serialized tensor; restore it as float32.
    embedding = tf.io.parse_tensor(parsed['emb'], out_type=tf.float32)
    return (embedding, parsed['track_id'])


def get_dataset(filename):
    """Open the TFRecord file(s) ``filename`` and decode every record with ``parse_tfr_element``."""
    raw_records = tf.data.TFRecordDataset(filename)
    return raw_records.map(parse_tfr_element)




In [117]:
import numpy as np


def load_dataset(path,dataset=args.embeddings):
    """Load every TFRecord under ``<path>/tfrecords/<dataset>`` into a DataFrame.

    Args:
        path: folder that contains the ``tfrecords`` directory.
        dataset: name of the sub-folder of ``tfrecords`` to read.

    Returns:
        DataFrame with columns ``feature`` and ``track_id``. ``feature`` holds
        the first element (``x[0]``) of each decoded tensor, or ``None`` when
        the tensor's first dimension is 0.
    """
    records_dir = os.path.join(path, 'tfrecords', dataset)
    # sorted() makes the read order independent of the filesystem's listing order.
    record_files = [
        os.path.join(records_dir, file_name)
        for file_name in sorted(os.listdir(records_dir))
    ]

    df = pd.DataFrame(
        get_dataset(record_files).as_numpy_iterator(),
        columns=['feature', 'track_id']
    )

    df.dropna(inplace=True)

    # No try/except here: the former handler printed an undefined name and so
    # replaced the real error with a NameError. Let the original exception
    # propagate instead.
    df.feature = df.feature.apply(lambda x: x[0] if x.shape[0] != 0 else None)
    return df


In [130]:
def __split_data__(group, percentage=0.1):
    if len(group) == 1:
        return group, group

    shuffled = shuffle(group.values)
    finish_test = int(ceil(len(group) * percentage))

    first = pd.DataFrame(shuffled[:finish_test], columns=group.columns)
    second = pd.DataFrame(shuffled[finish_test:], columns=group.columns)

    return first, second

def get_labels(labels_dict, labels, level='label1'):
    """Map every entry of ``labels`` through the lookup table ``labels_dict[level]``.

    Raises ``KeyError`` when an entry is missing from the table.
    """
    lookup_source = labels_dict
    return [lookup_source[level][label] for label in labels]
        

In [133]:
def select_dataset(df_tracks, labels_dict, test_fraction=0.01,
                   validation_fraction=0.01, oversampling_size=30):
    """Attach embeddings to the tracks and split them into train/test/validation.

    Args:
        df_tracks: track table with a ``track_id`` column and the columns
            ``labels_1`` .. ``labels_5``.
        labels_dict: lookup tables with keys ``label1`` .. ``label5``, used
            to convert every ``labels_N`` entry through ``get_labels``.
        test_fraction: fraction of each ``labels_5`` group (rounded up) that
            goes to the test set.
        validation_fraction: fraction of the remaining rows of each group
            (rounded up) that goes to the validation set.
        oversampling_size: groups whose training part is smaller than this
            are resampled with replacement up to this many rows.

    Returns:
        ``(df_train, df_test, df_val)``, each shuffled and re-indexed.
    """
    df = load_dataset(args.dataset_path, dataset=args.embeddings)
    df = df.dropna()

    df_tracks = df_tracks.merge(df, on='track_id')

    # Convert the raw labels of every level into their integer indices.
    for level in range(1, 6):
        column = f'labels_{level}'
        level_key = f'label{level}'
        df_tracks.loc[:, column] = df_tracks[column].progress_apply(
            lambda x, level_key=level_key: get_labels(labels_dict, x, level=level_key)
        )

    tests = []
    trains = []
    validations = []

    print(f"oversampling_size: {oversampling_size}")

    # Split inside every level-5 group so each class is present in all splits.
    for _, group in df_tracks.groupby("labels_5"):
        test, train_to_split = __split_data__(group, test_fraction)
        validation, train = __split_data__(train_to_split, validation_fraction)

        tests.append(test)
        validations.append(validation)

        # Oversample classes that have too few training rows.
        if len(train) < oversampling_size:
            train = train.sample(oversampling_size, replace=True)

        trains.append(train)

    df_test = pd.concat(tests, sort=False).sample(frac=1).reset_index(drop=True)
    df_val = pd.concat(validations, sort=False).sample(frac=1).reset_index(drop=True)
    df_train = pd.concat(trains, sort=False).sample(frac=1).reset_index(drop=True)

    return df_train, df_test, df_val

In [134]:
df_train, df_test, df_val = select_dataset(df_tracks, labels)

  0%|          | 0/104170 [00:00<?, ?it/s]

KeyError: '374'

In [60]:
df_train

Unnamed: 0,track_id,full_genre_id,labels_1,labels_2,labels_3,labels_4,labels_5,feature
0,115627,"[12, 25, 89]",8,135,164,373,84,"[0.021790445, 0.29853275, -0.026587656, 0.0666..."
1,146915,"[38, 41]",1,78,158,371,12,"[0.1287572, -0.04188484, -0.020825902, 0.01465..."
2,144872,"[38, 250]",1,92,218,385,126,"[0.084029794, -7.178386e-05, -0.053328514, -0...."
3,66741,"[1235, 18]",4,105,217,312,72,"[0.11808327, 0.10941779, 0.40051472, -0.049361..."
4,121728,"[1235, 107]",4,25,206,331,93,"[-0.03590433, 0.020767719, -0.03485803, -0.030..."
...,...,...,...,...,...,...,...,...
102226,96841,"[38, 41]",1,78,158,371,12,"[-0.03989613, -0.004323274, 0.02414892, 0.0396..."
102227,19591,[8],5,68,267,338,14,"[0.056298267, -0.018925885, -0.04787774, -0.01..."
102228,5166,[12],8,128,236,442,69,"[0.031009829, 0.012057076, -0.03138266, 0.0563..."
102229,106189,"[15, 286]",2,132,160,393,87,"[0.009615977, 0.12510824, 0.010890921, 0.00987..."


In [61]:
def _bytes_feature(value):
    """Return a ``tf.train.Feature`` holding a bytes_list built from ``value``.

    ``value`` may be an eager tensor, a single ``bytes`` object, or an
    iterable of ``bytes`` objects.
    """
    if isinstance(value, type(tf.constant(0))):  # eager tensor -> its numpy value
        value = value.numpy()
    if isinstance(value, bytes):
        # BytesList expects an iterable of bytes objects; a bare bytes value
        # would otherwise be iterated byte by byte (as ints) and rejected.
        value = [value]
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))

def _float_feature(value):
    """Wrap the iterable of floats ``value`` in a ``tf.train.Feature`` (float_list)."""
    float_list = tf.train.FloatList(value=value)
    return tf.train.Feature(float_list=float_list)

def _int64List_feature(value):
    """Wrap the iterable of integers ``value`` in a ``tf.train.Feature`` (int64_list)."""
    int64_list = tf.train.Int64List(value=value)
    return tf.train.Feature(int64_list=int64_list)

def _int64_feature(value):
    """Wrap the single integer-like ``value`` in a ``tf.train.Feature`` (int64_list of one item)."""
    single_item = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=single_item)

def serialize_array(array):
    """Return ``array`` serialized with ``tf.io.serialize_tensor``."""
    return tf.io.serialize_tensor(array)

In [62]:
def parse_single_music(data,labels):
    """Build the ``tf.train.Example`` for one row of the dataset.

    ``data`` must unpack into eight values: track id, an ignored field, the
    five level categories and the feature vector. Every ``labelN`` feature
    stores the pair ``[category, labels['labelN_count']]``.
    """
    track_id, _, cat1, cat2, cat3, cat4, cat5, music = data

    # One [category, level_count] int64 pair per level.
    level_pairs = [
        np.array([category, labels[f'label{level}_count']], np.int64)
        for level, category in enumerate((cat1, cat2, cat3, cat4, cat5), start=1)
    ]

    # Structure of the single example.
    feature_map = {}
    for level, pair in enumerate(level_pairs, start=1):
        feature_map[f'label{level}'] = _int64List_feature(pair)
    feature_map['features'] = _float_feature(music)
    feature_map['track_id'] = _int64_feature(track_id)

    # Wrap the individual features into an Example.
    return tf.train.Example(features=tf.train.Features(feature=feature_map))

In [63]:
def generate_tf_record(df,tf_path='val', batch_size=1024 * 50):
    """Write ``df`` as numbered ``.tfrecord`` files inside ``tf_path``.

    Args:
        df: DataFrame whose rows are accepted by ``parse_single_music``.
        tf_path: output directory (created when missing).
        batch_size: maximum number of records per output file.

    Returns:
        ``tf_path``.

    Rows are serialized with the notebook-level ``labels`` dictionary.
    """
    create_dir(tf_path)

    count = 0
    total = math.ceil(len(df) / batch_size)

    for start in range(0, len(df), batch_size):
        batch_df = df.iloc[start:start + batch_size]

        tfrecords = [parse_single_music(data, labels) for data in batch_df.values]

        path = f"{tf_path}/{str(count).zfill(10)}.tfrecord"

        with tf.io.TFRecordWriter(path) as writer:
            for tfrecord in tfrecords:
                writer.write(tfrecord.SerializeToString())

        print(f"{count} {len(tfrecords)} {path}")
        count += 1
        # Cap at len(df): the last batch is usually smaller than batch_size,
        # so count * batch_size would over-report the records written.
        print(f"{count}/{total} batchs / {min(count * batch_size, len(df))} processed")

    print(f"{count}/{total} batchs / {len(df)} processed")

    return tf_path

    

In [64]:
tfrecord_path

'/mnt/disks/data/fma/trains/hierarchical_all/tfrecords'

In [65]:
val_path = generate_tf_record(df_val,tf_path=os.path.join(tfrecord_path,'val'))

0 1114 /mnt/disks/data/fma/trains/hierarchical_all/tfrecords/val/0000000000.tfrecord
1/1 batchs / 51200 processed
1/1 batchs / 1114 processed


In [66]:
test_path = generate_tf_record(df_test,tf_path=os.path.join(tfrecord_path,'test'))

0 1125 /mnt/disks/data/fma/trains/hierarchical_all/tfrecords/test/0000000000.tfrecord
1/1 batchs / 51200 processed
1/1 batchs / 1125 processed


In [67]:
train_path = generate_tf_record(df_train,tf_path=os.path.join(tfrecord_path,'train'))

0 51200 /mnt/disks/data/fma/trains/hierarchical_all/tfrecords/train/0000000000.tfrecord
1/2 batchs / 51200 processed
1 51031 /mnt/disks/data/fma/trains/hierarchical_all/tfrecords/train/0000000001.tfrecord
2/2 batchs / 102400 processed
2/2 batchs / 102231 processed


In [68]:
def create_metadata(metadata_path):
    """Write the training metadata JSON to ``metadata_path``.

    The values come from notebook-level state: ``args``, ``labels_size``,
    ``labels``, the three tfrecord paths and the three split DataFrames.
    """
    with open(metadata_path, 'w+') as f:
        # Class count of every level, in level order.
        level_counts = [labels[f'label{level}_count'] for level in range(1, 6)]
        payload = {
            'sequence_size': args.sequence_size,
            'n_levels': labels_size,
            'labels_size': level_counts,
            'val_path': val_path,
            'train_path': train_path,
            'test_path': test_path,
            'trainset_count': len(df_train),
            'validationset_count': len(df_val),
            'testset_count': len(df_test)
        }
        f.write(json.dumps(payload))

In [69]:
create_metadata(metadata_path)

In [70]:
job_path

'/mnt/disks/data/fma/trains/hierarchical_all'

In [71]:
df_tracks.to_csv(os.path.join(job_path,"tracks.csv"),index=False)

In [72]:
with open(categories_labels_path, 'r') as f:
    labels = json.loads(f.read())

In [130]:
# For every level N: 'levelN_size' is labels['labelN_count'] minus one.
levels_size = {
    f'level{level}_size': labels[f'label{level}_count'] - 1
    for level in range(1, 6)
}

In [131]:
levels_size['level1_size']

15