In [47]:
import pandas as pd
import numpy as np
import tensorflow as tf
import json
import ast
import os
import csv
import math
from sklearn.utils import shuffle
from math import ceil

from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

In [48]:
tqdm.pandas()

In [49]:

args = pd.Series({
    "root_dir":"/mnt/disks/data/",
    "dataset_path":"/mnt/disks/data/fma/fma_large",
    "embeddings":"music_style",
    "sequence_size": 1280,
    "train_id": "hierarchical_single",
    'sample_size': 1
})


In [50]:

job_path = "/mnt/disks/data/fma/trains"

# In[15]:

train_path = os.path.join(job_path,args.train_id)


tfrecord_path = os.path.join(train_path,"tfrecords")

# In[16]:

base_path = os.path.join(args.root_dir,"fma")

# In[17]:

models_path = os.path.join(args.root_dir,"models")


metadata_path_fma = os.path.join(base_path,"fma_metadata")

# In[18]:

metadata_path = os.path.join(train_path,"metadata.json")


categories_labels_path = os.path.join(train_path,"labels.json")


In [51]:


def __load_json__(path):
    with open(path, 'r') as f:
        tmp = json.loads(f.read())

    return tmp




In [52]:

def create_dir(path):
    # checking if the directory demo_folder2 
    # exist or not.
    if not os.path.isdir(path):

        # if the demo_folder2 directory is 
        # not present then create it.
        os.makedirs(path)
    return True



In [53]:
create_dir(train_path)

True

## Load genres file. Contains relationships beetwen genres

In [54]:
genres_df = pd.read_csv(os.path.join(metadata_path_fma,'genres.csv'))


In [55]:
genres_df

Unnamed: 0,genre_id,#tracks,parent,title,top_level
0,1,8693,38,Avant-Garde,38
1,2,5271,0,International,2
2,3,1752,0,Blues,3
3,4,4126,0,Jazz,4
4,5,4106,0,Classical,5
...,...,...,...,...,...
158,1032,60,102,Turkish,2
159,1060,30,46,Tango,2
160,1156,26,130,Fado,2
161,1193,72,763,Christmas,38


In [56]:
genres_df[genres_df['genre_id'] == 495]

Unnamed: 0,genre_id,#tracks,parent,title,top_level
136,495,2061,15,Downtempo,15


In [57]:
# Cria um dicionário que associa o ID de cada música aos IDs de seus gêneros musicais
tracks_df = pd.read_csv(os.path.join(metadata_path_fma,'tracks_valid.csv'))

In [58]:
tracks_df = tracks_df.sample(frac=args.sample_size)

In [59]:
tracks_df.sample(20)

Unnamed: 0,track_id,track_title,valid_genre
35663,54767,Night Stalking,"['38', '88', '113']"
5737,10783,Indra Merca,"['18', '224']"
84031,127451,I Want to See Her Now,"['1', '38']"
68888,107558,Lamplight,"['10', '66']"
489,726,Track 01,['32']
28011,43310,Pennies From Heaven,"['38', '362']"
56820,85675,In the Can,"['4', '12', '38']"
69608,108444,Nuthin Bout Tha Weather,['21']
103916,154956,peace flower,"['18', '42', '495']"
86755,131127,Rock Zombie,"['25', '85']"


In [60]:
tracks_df.valid_genre.values

array(["['7', '15', '297']", "['167']", "['27', '66', '76', '77']", ...,
       "['240']", "['15', '32', '38', '247']", "['38', '53', '70']"],
      dtype=object)

In [61]:
tracks_df.track_title

45118                                  Plan 9
9600                                Excerpt 1
31361                                Epilogue
19711                              La Batalla
1018                              White Walls
                         ...                 
4782                     The Hydrologic Cycle
17134                                Man Down
51213    10 Days for the Three Malarkey Twins
15665                                Aircraft
50551                                 21. vek
Name: track_title, Length: 104186, dtype: object

In [62]:
## Get complete genre structure
def get_all_structure(estrutura,df_genres):
    ## Get structure from df_genres
    def get_all_structure_from_df(estrutura,df_genres,structure=[]):
        if estrutura == 0:
            return structure
        else:
            structure.append(int(estrutura))
            get_all_structure_from_df(df_genres[df_genres["genre_id"]==int(estrutura)].parent.values[0],df_genres,structure)
            return structure
    
    return get_all_structure_from_df(estrutura,df_genres,structure=[])
    

In [63]:
# tracks_df['valid_genre'] = tracks_df.track_genres.apply(lambda x: x.strip('][').split(', ') if x != '[]' else None)
tracks_df['valid_genre'] = tracks_df.valid_genre.apply(lambda x: ast.literal_eval(x))

In [64]:
tracks_df['last_genre_id'] = tracks_df.valid_genre.apply(lambda x:x[-1] if x != None else None)

In [65]:
tracks_df.sample(20)

Unnamed: 0,track_id,track_title,valid_genre,last_genre_id
74564,114975,stormy castles (present),"[32, 38]",38
19687,31845,Shallow Grave,"[12, 85]",85
52457,79928,Nighttime in dormitory,"[1, 18, 38]",38
58115,87374,Motif Three,[42],42
8406,14392,Never All Ways (clean),[21],21
96601,144493,Pulse,"[15, 267, 1235]",1235
101704,151351,100 Bags,"[100, 130]",130
55421,83902,Tear Drops,"[1, 38, 186]",186
32172,49872,The Walker Volume One Part Two,"[1, 25, 32]",32
42485,64357,To The Dusk (Original 1994 4track Recording),[21],21


In [20]:
tracks_df.dropna(inplace=True)

In [66]:
tracks_df

Unnamed: 0,track_id,track_title,valid_genre,last_genre_id
45118,67521,Plan 9,"[7, 15, 297]",297
9600,16210,Excerpt 1,[167],167
31361,48487,Epilogue,"[27, 66, 76, 77]",77
19711,31869,La Batalla,"[10, 12, 504, 619]",619
1018,1297,White Walls,[12],12
...,...,...,...,...
4782,9361,The Hydrologic Cycle,"[32, 38]",38
17134,28173,Man Down,"[15, 296, 468]",468
51213,76456,10 Days for the Three Malarkey Twins,[240],240
15665,25720,Aircraft,"[15, 32, 38, 247]",247


In [67]:
tracks_df['full_genre_id'] = tracks_df.last_genre_id.progress_apply(lambda x: get_all_structure(x,genres_df)[::-1])

  0%|          | 0/104186 [00:00<?, ?it/s]

In [68]:
tracks_df.full_genre_id

45118         [15, 297]
9600      [12, 31, 167]
31361           [2, 77]
19711     [2, 130, 619]
1018               [12]
              ...      
4782               [38]
17134         [15, 468]
51213    [15, 297, 240]
15665         [38, 247]
50551          [12, 70]
Name: full_genre_id, Length: 104186, dtype: object

In [69]:
tracks_df.columns

Index(['track_id', 'track_title', 'valid_genre', 'last_genre_id',
       'full_genre_id'],
      dtype='object')

In [70]:
tracks_df = tracks_df[['track_id','full_genre_id']]

In [71]:
tracks_df.full_genre_id.values

array([list([15, 297]), list([12, 31, 167]), list([2, 77]), ...,
       list([15, 297, 240]), list([38, 247]), list([12, 70])],
      dtype=object)

In [72]:
tracks_df.full_genre_id.info

<bound method Series.info of 45118         [15, 297]
9600      [12, 31, 167]
31361           [2, 77]
19711     [2, 130, 619]
1018               [12]
              ...      
4782               [38]
17134         [15, 468]
51213    [15, 297, 240]
15665         [38, 247]
50551          [12, 70]
Name: full_genre_id, Length: 104186, dtype: object>

In [73]:
labels_size = tracks_df.full_genre_id.apply(lambda x: len(x))

In [74]:
labels_size = labels_size.max()

In [75]:
labels_size

5

### Parse of label to structure

In [76]:
### Function for parse label to sctructure of hierarhical scheme

def parse_label(label,label_size=5):
    # label = label.split('-')
    # preencher com 0 no caso de haver menos de 5 níveis
    labels = np.zeros(label_size,dtype=int)
    for i, label in enumerate(label):
        if i == 5:
            break
        # Aqui você pode fazer a conversão do label em um índice inteiro usando um dicionário ou outro método
        # Neste exemplo, estou apenas usando a posição da label na lista como índice
        labels[i] = label
    return labels

In [77]:
parsed_labels = tracks_df.full_genre_id.apply(lambda x: parse_label(x))

In [78]:
tracks_df['full_genre_id']

45118         [15, 297]
9600      [12, 31, 167]
31361           [2, 77]
19711     [2, 130, 619]
1018               [12]
              ...      
4782               [38]
17134         [15, 468]
51213    [15, 297, 240]
15665         [38, 247]
50551          [12, 70]
Name: full_genre_id, Length: 104186, dtype: object

In [79]:
def convert_label_to_string(x,level=2):
    return '-'.join([str(value) for value in x[:level]])

In [80]:
tracks_df['labels_1'] = parsed_labels.progress_apply(lambda x: str(x[:1][0]))
tracks_df['labels_2'] = parsed_labels.progress_apply(lambda x: convert_label_to_string(x,level=2))
tracks_df['labels_3'] = parsed_labels.progress_apply(lambda x: convert_label_to_string(x,level=3))
tracks_df['labels_4'] = parsed_labels.progress_apply(lambda x: convert_label_to_string(x,level=4))
tracks_df['labels_5'] = parsed_labels.progress_apply(lambda x: convert_label_to_string(x,level=5))

  0%|          | 0/104186 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tracks_df['labels_1'] = parsed_labels.progress_apply(lambda x: str(x[:1][0]))


  0%|          | 0/104186 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tracks_df['labels_2'] = parsed_labels.progress_apply(lambda x: convert_label_to_string(x,level=2))


  0%|          | 0/104186 [00:00<?, ?it/s]

  0%|          | 0/104186 [00:00<?, ?it/s]

  0%|          | 0/104186 [00:00<?, ?it/s]

In [88]:
tracks_df = tracks_df[tracks_df['labels_1'].isin(["38","1235"])]

In [89]:
tracks_df['labels_2'].value_counts()

38-0        6508
1235-0      6043
1235-107    3988
38-250      3379
38-247      2749
38-41       1997
38-224      1422
38-32       1207
1235-18      982
38-47        841
38-456       829
38-125       703
38-514       686
38-6         519
38-186       428
38-1         313
38-30        254
38-22        231
1235-267     201
Name: labels_2, dtype: int64

In [90]:
tracks_df['labels_5'].value_counts()

38-0-0-0-0          6508
1235-0-0-0-0        6043
1235-107-0-0-0      3988
38-250-0-0-0        3379
38-247-0-0-0        2749
38-41-0-0-0         1997
38-224-0-0-0        1422
38-32-0-0-0         1207
38-47-0-0-0          841
38-456-0-0-0         829
38-125-0-0-0         703
38-514-0-0-0         686
1235-18-0-0-0        659
38-186-0-0-0         428
1235-18-538-0-0      323
38-1-0-0-0           313
38-30-0-0-0          254
38-22-0-0-0          231
1235-267-0-0-0       201
38-6-16-763-0        194
38-6-360-0-0         190
38-6-16-763-1193      72
38-6-0-0-0            63
Name: labels_5, dtype: int64

In [91]:
# tracks_df = tracks_df[tracks_df['labels_1'].isin(['3','14'])]

In [92]:
categories_df = pd.DataFrame({'level5':tracks_df.labels_5.unique()})

In [93]:
categories_df['level1'] = categories_df.level5.progress_apply(lambda x: '-'.join(x.split('-')[:1]))
categories_df['level2'] = categories_df.level5.progress_apply(lambda x: '-'.join(x.split('-')[:2]))
categories_df['level3'] = categories_df.level5.progress_apply(lambda x: '-'.join(x.split('-')[:3]))
categories_df['level4'] = categories_df.level5.progress_apply(lambda x: '-'.join(x.split('-')[:4]))

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

In [94]:
categories_df

Unnamed: 0,level5,level1,level2,level3,level4
0,38-456-0-0-0,38,38-456,38-456-0,38-456-0-0
1,38-186-0-0-0,38,38-186,38-186-0,38-186-0-0
2,38-41-0-0-0,38,38-41,38-41-0,38-41-0-0
3,38-250-0-0-0,38,38-250,38-250-0,38-250-0-0
4,38-32-0-0-0,38,38-32,38-32-0,38-32-0-0
5,38-514-0-0-0,38,38-514,38-514-0,38-514-0-0
6,1235-107-0-0-0,1235,1235-107,1235-107-0,1235-107-0-0
7,38-247-0-0-0,38,38-247,38-247-0,38-247-0-0
8,1235-0-0-0-0,1235,1235-0,1235-0-0,1235-0-0-0
9,38-125-0-0-0,38,38-125,38-125-0,38-125-0-0


In [95]:
def get_labels_name(x,genres_df):
    levels = 5
    full_name = []
    last_level = 0
    genre_root = ""
    for genre in x.split('-'):
        genre_df = genres_df[genres_df['genre_id'] == int(genre)]
        if genre_df.empty:
            genre_name = genre_root 
        else:
            genre_name = genre_df.title.values.tolist()[0]
            genre_root = genre_name
        
        full_name.append(genre_name)
    full_name = '>'.join(full_name)
        
    return full_name
    # return genres_df[genres_df['genre_id'] == int(x)].title.values.tolist()[0]

In [96]:
categories_df['level5_name'] = categories_df.level5.apply(lambda x: get_labels_name(x,genres_df))

In [97]:
categories_df

Unnamed: 0,level5,level1,level2,level3,level4,level5_name
0,38-456-0-0-0,38,38-456,38-456-0,38-456-0-0,Experimental>Minimalism>Minimalism>Minimalism>...
1,38-186-0-0-0,38,38-186,38-186-0,38-186-0-0,Experimental>Sound Poetry>Sound Poetry>Sound P...
2,38-41-0-0-0,38,38-41,38-41-0,38-41-0-0,Experimental>Electroacoustic>Electroacoustic>E...
3,38-250-0-0-0,38,38-250,38-250-0,38-250-0-0,Experimental>Improv>Improv>Improv>Improv
4,38-32-0-0-0,38,38-32,38-32-0,38-32-0-0,Experimental>Noise>Noise>Noise>Noise
5,38-514-0-0-0,38,38-514,38-514-0,38-514-0-0,Experimental>Sound Art>Sound Art>Sound Art>Sou...
6,1235-107-0-0-0,1235,1235-107,1235-107-0,1235-107-0-0,Instrumental>Ambient>Ambient>Ambient>Ambient
7,38-247-0-0-0,38,38-247,38-247-0,38-247-0-0,Experimental>Musique Concrete>Musique Concrete...
8,1235-0-0-0-0,1235,1235-0,1235-0-0,1235-0-0-0,Instrumental>Instrumental>Instrumental>Instrum...
9,38-125-0-0-0,38,38-125,38-125-0,38-125-0-0,Experimental>Unclassifiable>Unclassifiable>Unc...


In [98]:
def __create_labels__(categories_df):
    data = {
        "label1": {},
        "label2": {},
        "label3": {},
        "label4": {},
        "label5": {},
        "label1_inverse": [],
        "label2_inverse": [],
        "label3_inverse": [],
        "label4_inverse": [],
        "label5_inverse": [],
        "label1_name": {},
        "label2_name": {},
        "label3_name": {},
        "label4_name": {},
        "label5_name": {},
    }

    idx = 0
    
    for id_x, cat in enumerate(set(categories_df.level1.values.tolist())):
        data['label1'][cat] = idx
        data['label1_inverse'].append(cat)
        data['label1_count'] = idx + 1
        idx+=1

    for id_x, cat in enumerate(set(categories_df.level2.values.tolist())):
        data['label2'][cat] = idx
        data['label2_inverse'].append(cat)
        data['label2_count'] = idx + 1
        idx+=1
        
    for id_x, cat in enumerate(set(categories_df.level3.values.tolist())):
        data['label3'][cat] = idx
        data['label3_inverse'].append(cat)
        data['label3_count'] = idx + 1
        idx+=1

    for id_x, cat in enumerate(set(categories_df.level4.values.tolist())):
        data['label4'][cat] = idx
        data['label4_inverse'].append(cat)
        data['label4_count'] = idx + 1
        idx+=1
        
    for idx, cat in enumerate(set(categories_df.level5.values.tolist())):
        data['label5'][cat] = idx
        data['label5_inverse'].append(cat)
        data['label5_count'] = idx + 1
        idx+=1
        
    for cat5,cat1,cat2,cat3,cat4,name5 in categories_df.values:
        
        name1 = '>'.join(name5.split('>')[:1])
        name2 = '>'.join(name5.split('>')[:2])
        name3 = '>'.join(name5.split('>')[:3])
        name4 = '>'.join(name5.split('>')[:4])
        
        
        data['label1_name'][cat1] = name1
        data['label2_name'][cat2] = name2
        data['label3_name'][cat3] = name3
        data['label4_name'][cat4] = name4
        data['label5_name'][cat5] = name5
        
    return data

In [99]:
with open(categories_labels_path, 'w+') as f:
    f.write(json.dumps(__create_labels__(categories_df)))

In [100]:
labels  = __create_labels__(categories_df)

In [101]:
labels['label4']

{'1235-107-0-0': 43,
 '38-456-0-0': 44,
 '38-224-0-0': 45,
 '38-514-0-0': 46,
 '38-32-0-0': 47,
 '38-250-0-0': 48,
 '38-47-0-0': 49,
 '38-247-0-0': 50,
 '38-6-0-0': 51,
 '38-22-0-0': 52,
 '38-1-0-0': 53,
 '38-6-360-0': 54,
 '1235-267-0-0': 55,
 '38-125-0-0': 56,
 '1235-18-538-0': 57,
 '38-6-16-763': 58,
 '1235-18-0-0': 59,
 '38-30-0-0': 60,
 '38-186-0-0': 61,
 '1235-0-0-0': 62,
 '38-0-0-0': 63,
 '38-41-0-0': 64}

In [102]:
labels['label1_count']

2

In [103]:


def parse_tfr_element(element):
    #use the same structure as above; it's kinda an outline of the structure we now want to create
    data = {
        'emb' : tf.io.FixedLenFeature([], tf.string),
        'track_id' : tf.io.FixedLenFeature([], tf.int64),
    }
    
    content = tf.io.parse_single_example(element, data)

    track_id = content['track_id']
    emb = content['emb']
    

    #get our 'feature'-- our image -- and reshape it appropriately
    feature = tf.io.parse_tensor(emb, out_type=tf.float32)
    return (feature, track_id)


def get_dataset(filename):
    #create the dataset
    dataset = tf.data.TFRecordDataset(filename)

    #pass every single feature through our mapping function
    dataset = dataset.map(
        parse_tfr_element
    )

    return dataset




In [104]:
import numpy as np


def load_dataset(path,dataset=args.embeddings):
    tfrecords_path = os.path.join(path,'tfrecords',dataset)
    
    
    tfrecords_path = [os.path.join(tfrecords_path,path) for path in os.listdir(tfrecords_path)]
    dataset = get_dataset(tfrecords_path)
    
    df = pd.DataFrame(
        dataset.as_numpy_iterator(),
        columns=['feature', 'track_id']
    )
        
    df.dropna(inplace=True)
    
    
    try:
        df.feature = df.feature.apply(lambda x: x[0] if x.shape[0] != 0 else None)
    except:
        print(x)
    
    return df
    


In [105]:
tracks_df['labels_5']

34336       38-456-0-0-0
26287       38-186-0-0-0
46360        38-41-0-0-0
29287       38-250-0-0-0
101349       38-32-0-0-0
               ...      
62077       38-514-0-0-0
94876     1235-107-0-0-0
60181     1235-107-0-0-0
4782          38-0-0-0-0
15665       38-247-0-0-0
Name: labels_5, Length: 33280, dtype: object

In [106]:
def __split_data__(group, percentage=0.1):
    if len(group) == 1:
        return group, group

    shuffled = shuffle(group.values)
    finish_test = int(ceil(len(group) * percentage))

    first = pd.DataFrame(shuffled[:finish_test], columns=group.columns)
    second = pd.DataFrame(shuffled[finish_test:], columns=group.columns)

    return first, second

In [107]:
def select_dataset(tracks_df):
    
#     dataset_testset_path = os.path.join(tfrecord_path,'test')
#     dataset_validationset_path = os.path.join(tfrecord_path,'val')
#     dataset_trainset_path = os.path.join(tfrecord_path,'train')
    
    df = load_dataset(args.dataset_path,dataset=args.embeddings)
    
    df.dropna(inplace=True)
    
    tracks_df = tracks_df.merge(df, on='track_id')
    
    tracks_df.loc[:,'labels_1'] = tracks_df.labels_1.astype(str).progress_apply(lambda x: labels['label1'][x])
    tracks_df.loc[:,'labels_2'] = tracks_df.labels_2.astype(str).progress_apply(lambda x: labels['label2'][x])
    tracks_df.loc[:,'labels_3'] = tracks_df.labels_3.astype(str).progress_apply(lambda x: labels['label3'][x])
    tracks_df.loc[:,'labels_4'] = tracks_df.labels_4.astype(str).progress_apply(lambda x: labels['label4'][x])
    tracks_df.loc[:,'labels_5'] = tracks_df.labels_5.astype(str).progress_apply(lambda x: labels['label5'][x])
    
    tests = []
    trains = []
    validations = []
    groups = tracks_df.groupby("labels_5")
    
    
    count = 0
    items_count = 0
    total = len(groups)
    total_items = len(tracks_df)
    oversampling_size = 30  # int(group_sizes.mean() + group_sizes.std() * 2)
    print(f"oversampling_size: {oversampling_size}")

    for code, group in groups:
        test, train_to_split = __split_data__(group, 0.01)  # 10%
        train_to_split = train_to_split
        validation, train = __split_data__(train_to_split, 0.01)  # %1

        tests.append(test)
        validations.append(validation)

        ## this increase the numner of samples when classes has low quantity
        count_train = len(train)
        if count_train < oversampling_size:
            train = train.sample(oversampling_size, replace=True)

        trains.append(train)

        count += 1
        items_count += count_train
        
        
    df_test = pd.concat(tests, sort=False).sample(frac=1).reset_index(drop=True)
    # .to_csv(dataset_testset_path, index=False,quoting=csv.QUOTE_ALL)
    df_val = pd.concat(validations, sort=False).sample(frac=1).reset_index(drop=True)
    df_train = pd.concat(trains, sort=False).sample(frac=1).reset_index(drop=True)

    return df_train,df_test,df_val

In [108]:
df_train,df_test,df_val = select_dataset(tracks_df)

2023-05-12 15:16:31.605189: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 713 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:65:00.0, compute capability: 7.5
2023-05-12 15:16:32.299523: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [11]
	 [[{{node Placeholder/_0}}]]


  0%|          | 0/33278 [00:00<?, ?it/s]

  tracks_df.loc[:,'labels_1'] = tracks_df.labels_1.astype(str).progress_apply(lambda x: labels['label1'][x])


  0%|          | 0/33278 [00:00<?, ?it/s]

  tracks_df.loc[:,'labels_2'] = tracks_df.labels_2.astype(str).progress_apply(lambda x: labels['label2'][x])


  0%|          | 0/33278 [00:00<?, ?it/s]

  tracks_df.loc[:,'labels_3'] = tracks_df.labels_3.astype(str).progress_apply(lambda x: labels['label3'][x])


  0%|          | 0/33278 [00:00<?, ?it/s]

  tracks_df.loc[:,'labels_4'] = tracks_df.labels_4.astype(str).progress_apply(lambda x: labels['label4'][x])


  0%|          | 0/33278 [00:00<?, ?it/s]

  tracks_df.loc[:,'labels_5'] = tracks_df.labels_5.astype(str).progress_apply(lambda x: labels['label5'][x])


oversampling_size: 30


In [109]:
df_train

Unnamed: 0,track_id,full_genre_id,labels_1,labels_2,labels_3,labels_4,labels_5,feature
0,154634,"[38, 47]",1,4,23,49,11,"[0.029363057, 0.014465262, 0.0016358495, 0.149..."
1,49886,"[38, 250]",1,6,22,48,5,"[0.13567193, -0.019791266, -0.011914919, -0.00..."
2,2103,"[38, 22]",1,19,28,52,20,"[0.034534056, 0.006208837, 0.016459614, -0.018..."
3,38200,[38],1,16,36,63,0,"[-0.037430465, -0.03674521, 0.060465634, -0.06..."
4,107269,"[38, 250]",1,6,22,48,5,"[0.1401887, 0.104203396, -0.00028573474, 0.016..."
...,...,...,...,...,...,...,...,...
32588,135831,[1235],0,17,24,62,3,"[-0.015580535, 0.086062945, -0.0069233277, 0.0..."
32589,153139,[38],1,16,36,63,0,"[9.556611e-06, 0.048067074, -0.047501296, 0.16..."
32590,84453,"[38, 47]",1,4,23,49,11,"[0.14999753, -0.013276826, -0.010032217, -0.00..."
32591,92952,"[1235, 107]",0,20,26,43,15,"[0.030459225, -0.028177211, 0.13664915, 0.0629..."


In [110]:

def _bytes_feature(value):
    """Returns a bytes_list from a string / byte."""
    if isinstance(value, type(tf.constant(0))): # if value ist tensor
        value = value.numpy() # get value of tensor
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
  """Returns a floast_list from a float / double."""
  return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def _int64_feature(value):
  """Returns an int64_list from a bool / enum / int / uint."""
  return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def serialize_array(array):
  array = tf.io.serialize_tensor(array)
  return array


In [111]:
def parse_single_music(data,labels):
    # cat1, cat2, cat3, cat4, cat5 = data
    track_id, _, cat1, cat2, cat3, cat4, cat5, music = data
    
    label1 = np.array([cat1, labels['label1_count']], np.int64)
    label2 = np.array([cat2, labels['label2_count']], np.int64)
    label3 = np.array([cat3, labels['label3_count']], np.int64)
    label4 = np.array([cat4, labels['label4_count']], np.int64)
    label5 = np.array([cat5, labels['label5_count']], np.int64)
    
    
    #define the dictionary -- the structure -- of our single example
    data = {
        'label1': tf.train.Feature(int64_list=tf.train.Int64List(value=label1)),
        'label2': tf.train.Feature(int64_list=tf.train.Int64List(value=label2)),
        'label3': tf.train.Feature(int64_list=tf.train.Int64List(value=label3)),
        'label4': tf.train.Feature(int64_list=tf.train.Int64List(value=label4)),
        'emb' : _bytes_feature(serialize_array(music)),
        'track_id' : _int64_feature(track_id)
    }
    #create an Example, wrapping the single features
    out = tf.train.Example(features=tf.train.Features(feature=data))

    return out

In [112]:
def generate_tf_record(df,tf_path='tfrecords/val'):
    create_dir(tf_path)
    
    
    batch_size = 1024 * 50 # 50k records from each file batch
    count = 0
    total = math.ceil(len(df) / batch_size)

    for i in range(0, len(df), batch_size):
        batch_df = df[i:i+batch_size]
        
        tfrecords = [parse_single_music(data, labels) for data in batch_df.values]
        
        path = f"{tf_path}/{str(count).zfill(10)}.tfrecord"
        
        #with tf.python_io.TFRecordWriter(path) as writer:
        with tf.io.TFRecordWriter(path) as writer:
            for tfrecord in tfrecords:
                writer.write(tfrecord.SerializeToString())

        print(f"{count} {len(tfrecords)} {path}")
        count += 1
        print(f"{count}/{total} batchs / {count * batch_size} processed")

    print(f"{count}/{total} batchs / {len(df)} processed")

    

In [113]:
generate_tf_record(df_val,tf_path=os.path.join(tfrecord_path,'val'))

0 340 /mnt/disks/data/fma/trains/hierarchical_single/tfrecords/val/0000000000.tfrecord
1/1 batchs / 51200 processed
1/1 batchs / 340 processed


In [114]:
generate_tf_record(df_test,tf_path=os.path.join(tfrecord_path,'test'))

0 345 /mnt/disks/data/fma/trains/hierarchical_single/tfrecords/test/0000000000.tfrecord
1/1 batchs / 51200 processed
1/1 batchs / 345 processed


In [115]:
generate_tf_record(df_train,tf_path=os.path.join(tfrecord_path,'train'))

0 32593 /mnt/disks/data/fma/trains/hierarchical_single/tfrecords/train/0000000000.tfrecord
1/1 batchs / 51200 processed
1/1 batchs / 32593 processed


In [119]:
def create_metadata(metadata_path):

    with open(metadata_path, 'w+') as f:
        f.write(json.dumps({
            'sequence_size': args.sequence_size,
            'n_levels': labels_size,
            'labels_size': [labels['label1_count'],labels['label2_count'],
                           labels['label3_count'],labels['label4_count'],
                           labels['label5_count']],
            'trainset_count':len(df_train),
            'validationset_count':len(df_val),
            'testset_count': len(df_test)
        }))

In [122]:
create_metadata(metadata_path)

In [120]:
tracks_df.to_csv(os.path.join(train_path,"tracks.csv"),index=False)