In [60]:
import pandas as pd
import numpy as np
import tensorflow as tf
import json
import ast
import os
import csv
import math
from sklearn.utils import shuffle
from math import ceil
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

In [61]:
tqdm.pandas()

In [62]:

args = pd.Series({
    "root_dir":"/mnt/disks/data/",
    "dataset_path":"/mnt/disks/data/fma/fma_large",
    "embeddings":"music_style",
    "sequence_size": 1280,
    "train_id": "hierarchical_all",
    'sample_size': 1
})


In [63]:

base_path = "/mnt/disks/data/fma/trains"


job_path = os.path.join(base_path,args.train_id)


tfrecord_path = os.path.join(job_path,"tfrecords")

# In[16]:

base_path = os.path.join(args.root_dir,"fma")

# In[17]:

models_path = os.path.join(args.root_dir,"models")


metadata_path_fma = os.path.join(base_path,"fma_metadata")

# In[18]:

metadata_path = os.path.join(job_path,"metadata.json")


categories_labels_path = os.path.join(job_path,"labels.json")


In [64]:


def __load_json__(path):
    with open(path, 'r') as f:
        tmp = json.loads(f.read())

    return tmp




In [65]:

def create_dir(path):
    # checking if the directory demo_folder2 
    # exist or not.
    if not os.path.isdir(path):

        # if the demo_folder2 directory is 
        # not present then create it.
        os.makedirs(path)
    return True



In [66]:
import shutil
shutil.rmtree(job_path)

In [67]:
create_dir(job_path)

True

## Load genres file. Contains relationships beetwen genres

In [68]:
genres_df = pd.read_csv(os.path.join(metadata_path_fma,'genres.csv'))


In [69]:
genres_df

Unnamed: 0,genre_id,#tracks,parent,title,top_level
0,1,8693,38,Avant-Garde,38
1,2,5271,0,International,2
2,3,1752,0,Blues,3
3,4,4126,0,Jazz,4
4,5,4106,0,Classical,5
...,...,...,...,...,...
158,1032,60,102,Turkish,2
159,1060,30,46,Tango,2
160,1156,26,130,Fado,2
161,1193,72,763,Christmas,38


In [70]:
genres_df[genres_df['genre_id'] == 495]

Unnamed: 0,genre_id,#tracks,parent,title,top_level
136,495,2061,15,Downtempo,15


In [71]:
# Cria um dicionário que associa o ID de cada música aos IDs de seus gêneros musicais
tracks_df = pd.read_csv(os.path.join(metadata_path_fma,'tracks_valid.csv'))

In [72]:
tracks_df = tracks_df.sample(frac=args.sample_size)

In [73]:
tracks_df.sample(20)

Unnamed: 0,track_id,track_title,valid_genre
46414,69720,Tu vas crevette,"['88', '90']"
34686,53367,Easy Ride,['100']
82343,125112,Good-Night,['8']
18731,30361,Make Your Breasts Kiss,"['15', '38', '41', '297']"
73278,113276,Sweet,['2']
75650,116338,Morcore,"['15', '236', '296']"
10289,17174,Installation With Violins Inserted Between The...,"['1', '38']"
40246,61166,EasSCoasT WessCoasT,"['21', '539']"
55891,84545,re3(re:construction),"['32', '38']"
87625,132215,Kad nam nestane ljudi,['15']


In [74]:
tracks_df.valid_genre.values

array(["['27', '66']", "['5']", "['41', '456', '659']", ..., "['85']",
       "['12', '25']", "['15', '38']"], dtype=object)

In [75]:
tracks_df.track_title

89145                         So Unreal
42621                         Interview
53612                             Twill
47827                 Love Your Friends
50537                     Srpski pasulj
                      ...              
59511                      You Are Dead
83472           Happy Harlem (explicit)
9694               You Know That I Will
35794                   Your Mind Moves
79041    Live at RDA69 27/02/2015 part1
Name: track_title, Length: 104186, dtype: object

In [76]:
## Get complete genre structure
def get_all_structure(estrutura,df_genres):
    ## Get structure from df_genres
    def get_all_structure_from_df(estrutura,df_genres,structure=[]):
        if estrutura == 0:
            return structure
        else:
            structure.append(int(estrutura))
            get_all_structure_from_df(df_genres[df_genres["genre_id"]==int(estrutura)].parent.values[0],df_genres,structure)
            return structure
    
    return get_all_structure_from_df(estrutura,df_genres,structure=[])
    

In [77]:
# tracks_df['valid_genre'] = tracks_df.track_genres.apply(lambda x: x.strip('][').split(', ') if x != '[]' else None)
tracks_df['valid_genre'] = tracks_df.valid_genre.apply(lambda x: ast.literal_eval(x))

In [78]:
tracks_df['last_genre_id'] = tracks_df.valid_genre.apply(lambda x:x[-1] if x != None else None)

In [79]:
tracks_df.sample(20)

Unnamed: 0,track_id,track_title,valid_genre,last_genre_id
22033,35198,It's Now or Never (Elvis Presley),"[1, 38, 125]",125
61206,93933,April,"[184, 400]",400
54750,83020,Kill Me,"[10, 25, 109]",109
63710,97653,Arrington and Gombloh Take 3,"[2, 74, 250]",250
68039,106493,Roadways,"[15, 107, 495]",495
69135,107873,Another world,"[17, 25, 53]",53
46749,70183,Lockheed,"[15, 38, 42, 70, 183, 236]",236
20714,33191,Big Hunk of Love,[12],12
50938,76124,deconstrucción,"[107, 659]",659
12646,20916,"no hug, no eyecontact, but a hand breaking shake","[1, 27, 36, 76, 125, 186]",186


In [80]:
tracks_df.dropna(inplace=True)

In [81]:
tracks_df

Unnamed: 0,track_id,track_title,valid_genre,last_genre_id
89145,134034,So Unreal,"[27, 66]",66
42621,64503,Interview,[5],5
53612,81635,Twill,"[41, 456, 659]",659
47827,71633,Love Your Friends,[25],25
50537,75527,Srpski pasulj,"[25, 66, 89]",89
...,...,...,...,...
59511,91032,You Are Dead,"[15, 70]",70
83472,126782,Happy Harlem (explicit),[17],17
9694,16321,You Know That I Will,[85],85
35794,54952,Your Mind Moves,"[12, 25]",25


In [82]:
tracks_df['full_genre_id'] = tracks_df.last_genre_id.progress_apply(lambda x: get_all_structure(x,genres_df)[::-1])

  0%|          | 0/104185 [00:00<?, ?it/s]

In [83]:
tracks_df.full_genre_id

89145        [12, 66]
42621             [5]
53612        [5, 659]
47827        [12, 25]
50537    [12, 25, 89]
             ...     
59511        [12, 70]
83472            [17]
9694         [12, 85]
35794        [12, 25]
79041            [38]
Name: full_genre_id, Length: 104185, dtype: object

In [84]:
tracks_df.columns

Index(['track_id', 'track_title', 'valid_genre', 'last_genre_id',
       'full_genre_id'],
      dtype='object')

In [85]:
tracks_df = tracks_df[['track_id','full_genre_id']]

In [86]:
tracks_df.full_genre_id.values

array([list([12, 66]), list([5]), list([5, 659]), ..., list([12, 85]),
       list([12, 25]), list([38])], dtype=object)

In [87]:
tracks_df.full_genre_id.info

<bound method Series.info of 89145        [12, 66]
42621             [5]
53612        [5, 659]
47827        [12, 25]
50537    [12, 25, 89]
             ...     
59511        [12, 70]
83472            [17]
9694         [12, 85]
35794        [12, 25]
79041            [38]
Name: full_genre_id, Length: 104185, dtype: object>

In [88]:
labels_size = tracks_df.full_genre_id.apply(lambda x: len(x))

In [89]:
labels_size = int(labels_size.max())

In [90]:
type(labels_size)

int

### Parse of label to structure

In [91]:
### Function for parse label to sctructure of hierarhical scheme

def parse_label(label,label_size=5):
    # label = label.split('-')
    # preencher com 0 no caso de haver menos de 5 níveis
    labels = np.zeros(label_size,dtype=int)
    for i, label in enumerate(label):
        if i == 5:
            break
        # Aqui você pode fazer a conversão do label em um índice inteiro usando um dicionário ou outro método
        # Neste exemplo, estou apenas usando a posição da label na lista como índice
        labels[i] = label
    return labels

In [92]:
parsed_labels = tracks_df.full_genre_id.apply(lambda x: parse_label(x))

In [93]:
tracks_df['full_genre_id']

89145        [12, 66]
42621             [5]
53612        [5, 659]
47827        [12, 25]
50537    [12, 25, 89]
             ...     
59511        [12, 70]
83472            [17]
9694         [12, 85]
35794        [12, 25]
79041            [38]
Name: full_genre_id, Length: 104185, dtype: object

In [94]:
def convert_label_to_string(x,level=2):
    return '-'.join([str(value) for value in x[:level]])

In [95]:
tracks_df['labels_1'] = parsed_labels.progress_apply(lambda x: str(x[:1][0]))
tracks_df['labels_2'] = parsed_labels.progress_apply(lambda x: convert_label_to_string(x,level=2))
tracks_df['labels_3'] = parsed_labels.progress_apply(lambda x: convert_label_to_string(x,level=3))
tracks_df['labels_4'] = parsed_labels.progress_apply(lambda x: convert_label_to_string(x,level=4))
tracks_df['labels_5'] = parsed_labels.progress_apply(lambda x: convert_label_to_string(x,level=5))

  0%|          | 0/104185 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tracks_df['labels_1'] = parsed_labels.progress_apply(lambda x: str(x[:1][0]))


  0%|          | 0/104185 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tracks_df['labels_2'] = parsed_labels.progress_apply(lambda x: convert_label_to_string(x,level=2))


  0%|          | 0/104185 [00:00<?, ?it/s]

  0%|          | 0/104185 [00:00<?, ?it/s]

  0%|          | 0/104185 [00:00<?, ?it/s]

In [96]:
# tracks_df = tracks_df[tracks_df['labels_1'].isin(["38","1235"])]

In [97]:
tracks_df['labels_2'].value_counts()

labels_2
38-0        6508
1235-0      6043
12-25       5706
10-76       4126
1235-107    3988
            ... 
13-0          17
20-374         9
2-117          8
20-7           2
5-444          2
Name: count, Length: 121, dtype: int64

In [98]:
tracks_df['labels_5'].value_counts()

labels_5
38-0-0-0-0        6508
1235-0-0-0-0      6043
10-76-0-0-0       4126
1235-107-0-0-0    3988
17-103-0-0-0      3482
                  ... 
9-651-493-0-0        4
20-65-189-0-0        4
2-86-173-0-0         4
20-7-0-0-0           2
5-444-0-0-0          2
Name: count, Length: 159, dtype: int64

In [99]:
# tracks_df = tracks_df[tracks_df['labels_1'].isin(['3','14'])]

In [100]:
categories_df = pd.DataFrame({'level5':tracks_df.labels_5.unique()})

In [101]:
categories_df['level1'] = categories_df.level5.progress_apply(lambda x: '-'.join(x.split('-')[:1]))
categories_df['level2'] = categories_df.level5.progress_apply(lambda x: '-'.join(x.split('-')[:2]))
categories_df['level3'] = categories_df.level5.progress_apply(lambda x: '-'.join(x.split('-')[:3]))
categories_df['level4'] = categories_df.level5.progress_apply(lambda x: '-'.join(x.split('-')[:4]))

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

In [102]:
categories_df

Unnamed: 0,level5,level1,level2,level3,level4
0,12-66-0-0-0,12,12-66,12-66-0,12-66-0-0
1,5-0-0-0-0,5,5-0,5-0-0,5-0-0-0
2,5-659-0-0-0,5,5-659,5-659-0,5-659-0-0
3,12-25-0-0-0,12,12-25,12-25-0,12-25-0-0
4,12-25-89-0-0,12,12-25,12-25-89,12-25-89-0
...,...,...,...,...,...
154,20-65-189-0-0,20,20-65,20-65-189,20-65-189-0
155,2-86-173-0-0,2,2-86,2-86-173,2-86-173-0
156,20-7-0-0-0,20,20-7,20-7-0,20-7-0-0
157,2-46-808-0-0,2,2-46,2-46-808,2-46-808-0


In [103]:
def get_labels_name(x,genres_df):
    levels = 5
    full_name = []
    last_level = 0
    genre_root = ""
    for genre in x.split('-'):
        genre_df = genres_df[genres_df['genre_id'] == int(genre)]
        if genre_df.empty:
            genre_name = genre_root 
        else:
            genre_name = genre_df.title.values.tolist()[0]
            genre_root = genre_name
        
        full_name.append(genre_name)
    full_name = '>'.join(full_name)
        
    return full_name
    # return genres_df[genres_df['genre_id'] == int(x)].title.values.tolist()[0]

In [104]:
categories_df['level5_name'] = categories_df.level5.apply(lambda x: get_labels_name(x,genres_df))

In [105]:
categories_df

Unnamed: 0,level5,level1,level2,level3,level4,level5_name
0,12-66-0-0-0,12,12-66,12-66-0,12-66-0-0,Rock>Indie-Rock>Indie-Rock>Indie-Rock>Indie-Rock
1,5-0-0-0-0,5,5-0,5-0-0,5-0-0-0,Classical>Classical>Classical>Classical>Classical
2,5-659-0-0-0,5,5-659,5-659-0,5-659-0-0,Classical>Contemporary Classical>Contemporary ...
3,12-25-0-0-0,12,12-25,12-25-0,12-25-0-0,Rock>Punk>Punk>Punk>Punk
4,12-25-89-0-0,12,12-25,12-25-89,12-25-89-0,Rock>Punk>Post-Punk>Post-Punk>Post-Punk
...,...,...,...,...,...,...
154,20-65-189-0-0,20,20-65,20-65-189,20-65-189-0,Spoken>Radio>Talk Radio>Talk Radio>Talk Radio
155,2-86-173-0-0,2,2-86,2-86-173,2-86-173-0,International>Indian>N. Indian Traditional>N. ...
156,20-7-0-0-0,20,20-7,20-7-0,20-7-0-0,Spoken>Comedy>Comedy>Comedy>Comedy
157,2-46-808-0-0,2,2-46,2-46-808,2-46-808-0,International>Latin America>Salsa>Salsa>Salsa


In [106]:
def __create_labels__(categories_df):
    data = {
        "label1": {},
        "label2": {},
        "label3": {},
        "label4": {},
        "label5": {},
        "label1_inverse": [],
        "label2_inverse": [],
        "label3_inverse": [],
        "label4_inverse": [],
        "label5_inverse": [],
        "label1_name": {},
        "label2_name": {},
        "label3_name": {},
        "label4_name": {},
        "label5_name": {},
    }

    idx = 0
    
    for id_x, cat in enumerate(set(categories_df.level1.values.tolist())):
        data['label1'][cat] = idx
        data['label1_inverse'].append(cat)
        data['label1_count'] = idx + 1
        idx+=1

    for id_x, cat in enumerate(set(categories_df.level2.values.tolist())):
        data['label2'][cat] = idx
        data['label2_inverse'].append(cat)
        data['label2_count'] = idx + 1
        idx+=1
        
    for id_x, cat in enumerate(set(categories_df.level3.values.tolist())):
        data['label3'][cat] = idx
        data['label3_inverse'].append(cat)
        data['label3_count'] = idx + 1
        idx+=1

    for id_x, cat in enumerate(set(categories_df.level4.values.tolist())):
        data['label4'][cat] = idx
        data['label4_inverse'].append(cat)
        data['label4_count'] = idx + 1
        idx+=1
        
    for idx, cat in enumerate(set(categories_df.level5.values.tolist())):
        data['label5'][cat] = idx
        data['label5_inverse'].append(cat)
        data['label5_count'] = idx + 1
        idx+=1
        
    for cat5,cat1,cat2,cat3,cat4,name5 in categories_df.values:
        
        name1 = '>'.join(name5.split('>')[:1])
        name2 = '>'.join(name5.split('>')[:2])
        name3 = '>'.join(name5.split('>')[:3])
        name4 = '>'.join(name5.split('>')[:4])
        
        
        data['label1_name'][cat1] = name1
        data['label2_name'][cat2] = name2
        data['label3_name'][cat3] = name3
        data['label4_name'][cat4] = name4
        data['label5_name'][cat5] = name5
        
    return data

In [107]:
with open(categories_labels_path, 'w+') as f:
    f.write(json.dumps(__create_labels__(categories_df)))

In [108]:
labels  = __create_labels__(categories_df)

In [109]:
labels['label5']

{'12-26-113-0-0': 0,
 '15-468-0-0-0': 1,
 '12-26-0-0-0': 2,
 '12-25-64-0-0': 3,
 '4-37-0-0-0': 4,
 '15-337-0-0-0': 5,
 '2-46-502-0-0': 6,
 '2-172-0-0-0': 7,
 '38-514-0-0-0': 8,
 '2-86-0-0-0': 9,
 '15-184-0-0-0': 10,
 '12-440-0-0-0': 11,
 '15-296-0-0-0': 12,
 '12-31-167-0-0': 13,
 '20-7-0-0-0': 14,
 '9-63-0-0-0': 15,
 '20-0-0-0-0': 16,
 '20-378-0-0-0': 17,
 '12-31-101-0-0': 18,
 '2-46-0-0-0': 19,
 '15-0-0-0-0': 20,
 '2-176-0-0-0': 21,
 '15-695-0-0-0': 22,
 '5-322-0-0-0': 23,
 '5-441-0-0-0': 24,
 '20-65-0-0-0': 25,
 '38-0-0-0-0': 26,
 '38-41-0-0-0': 27,
 '13-311-0-0-0': 28,
 '2-86-173-0-0': 29,
 '38-125-0-0-0': 30,
 '3-0-0-0-0': 31,
 '4-97-0-0-0': 32,
 '12-36-0-0-0': 33,
 '1235-18-538-0-0': 34,
 '14-11-0-0-0': 35,
 '20-465-0-0-0': 36,
 '38-247-0-0-0': 37,
 '15-181-401-0-0': 38,
 '1235-0-0-0-0': 39,
 '20-138-0-0-0': 40,
 '12-25-0-0-0': 41,
 '38-456-0-0-0': 42,
 '2-118-0-0-0': 43,
 '17-103-0-0-0': 44,
 '12-314-0-0-0': 45,
 '17-180-0-0-0': 46,
 '14-0-0-0-0': 47,
 '5-443-0-0-0': 48,
 '21-693

In [110]:
labels['label1_count']

16

In [111]:


def parse_tfr_element(element):
    #use the same structure as above; it's kinda an outline of the structure we now want to create
    data = {
        'emb' : tf.io.FixedLenFeature([], tf.string),
        'track_id' : tf.io.FixedLenFeature([], tf.int64),
    }
    
    content = tf.io.parse_single_example(element, data)

    track_id = content['track_id']
    emb = content['emb']
    

    #get our 'feature'-- our image -- and reshape it appropriately
    feature = tf.io.parse_tensor(emb, out_type=tf.float32)
    return (feature, track_id)


def get_dataset(filename):
    #create the dataset
    dataset = tf.data.TFRecordDataset(filename)

    #pass every single feature through our mapping function
    dataset = dataset.map(
        parse_tfr_element
    )

    return dataset




In [112]:
import numpy as np


def load_dataset(path,dataset=args.embeddings):
    tfrecords_path = os.path.join(path,'tfrecords',dataset)
    
    
    tfrecords_path = [os.path.join(tfrecords_path,path) for path in os.listdir(tfrecords_path)]
    dataset = get_dataset(tfrecords_path)
    
    df = pd.DataFrame(
        dataset.as_numpy_iterator(),
        columns=['feature', 'track_id']
    )
        
    df.dropna(inplace=True)
    
    
    try:
        df.feature = df.feature.apply(lambda x: x[0] if x.shape[0] != 0 else None)
    except:
        print(x)
    
    return df
    


In [113]:
tracks_df['labels_5'].value_counts()

labels_5
38-0-0-0-0        6508
1235-0-0-0-0      6043
10-76-0-0-0       4126
1235-107-0-0-0    3988
17-103-0-0-0      3482
                  ... 
9-651-493-0-0        4
20-65-189-0-0        4
2-86-173-0-0         4
20-7-0-0-0           2
5-444-0-0-0          2
Name: count, Length: 159, dtype: int64

In [114]:
def __split_data__(group, percentage=0.1):
    if len(group) == 1:
        return group, group

    shuffled = shuffle(group.values)
    finish_test = int(ceil(len(group) * percentage))

    first = pd.DataFrame(shuffled[:finish_test], columns=group.columns)
    second = pd.DataFrame(shuffled[finish_test:], columns=group.columns)

    return first, second

In [115]:
def select_dataset(tracks_df):
    
#     dataset_testset_path = os.path.join(tfrecord_path,'test')
#     dataset_validationset_path = os.path.join(tfrecord_path,'val')
#     dataset_trainset_path = os.path.join(tfrecord_path,'train')
    
    df = load_dataset(args.dataset_path,dataset=args.embeddings)
    
    df.dropna(inplace=True)
    
    tracks_df = tracks_df.merge(df, on='track_id')
    
    tracks_df.loc[:,'labels_1'] = tracks_df.labels_1.astype(str).progress_apply(lambda x: labels['label1'][x])
    tracks_df.loc[:,'labels_2'] = tracks_df.labels_2.astype(str).progress_apply(lambda x: labels['label2'][x])
    tracks_df.loc[:,'labels_3'] = tracks_df.labels_3.astype(str).progress_apply(lambda x: labels['label3'][x])
    tracks_df.loc[:,'labels_4'] = tracks_df.labels_4.astype(str).progress_apply(lambda x: labels['label4'][x])
    tracks_df.loc[:,'labels_5'] = tracks_df.labels_5.astype(str).progress_apply(lambda x: labels['label5'][x])
    
    tests = []
    trains = []
    validations = []
    groups = tracks_df.groupby("labels_5")
    
    
    count = 0
    items_count = 0
    total = len(groups)
    total_items = len(tracks_df)
    oversampling_size = 30  # int(group_sizes.mean() + group_sizes.std() * 2)
    print(f"oversampling_size: {oversampling_size}")

    for code, group in groups:
        test, train_to_split = __split_data__(group, 0.01)  # 10%
        train_to_split = train_to_split
        validation, train = __split_data__(train_to_split, 0.01)  # %1

        tests.append(test)
        validations.append(validation)

        ## this increase the numner of samples when classes has low quantity
        count_train = len(train)
        if count_train < oversampling_size:
            train = train.sample(oversampling_size, replace=True)

        trains.append(train)

        count += 1
        items_count += count_train
        
        
    df_test = pd.concat(tests, sort=False).sample(frac=1).reset_index(drop=True)
    # .to_csv(dataset_testset_path, index=False,quoting=csv.QUOTE_ALL)
    df_val = pd.concat(validations, sort=False).sample(frac=1).reset_index(drop=True)
    df_train = pd.concat(trains, sort=False).sample(frac=1).reset_index(drop=True)

    return df_train,df_test,df_val

In [116]:
df_train,df_test,df_val = select_dataset(tracks_df)

2023-06-26 14:39:27.056356: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [11]
	 [[{{node Placeholder/_0}}]]


  0%|          | 0/104169 [00:00<?, ?it/s]

  0%|          | 0/104169 [00:00<?, ?it/s]

  0%|          | 0/104169 [00:00<?, ?it/s]

  0%|          | 0/104169 [00:00<?, ?it/s]

  0%|          | 0/104169 [00:00<?, ?it/s]

oversampling_size: 30


In [117]:
df_train

Unnamed: 0,track_id,full_genre_id,labels_1,labels_2,labels_3,labels_4,labels_5,feature
0,98203,"[15, 296]",8,40,220,347,12,"[-0.023319623, -0.00013373296, 0.19044419, -0...."
1,106253,[1235],7,103,276,334,39,"[-0.060997088, 0.05078725, 0.07981362, -0.0090..."
2,31634,[15],8,47,190,431,20,"[-0.009909769, 0.017467469, 0.05052768, 0.0799..."
3,123894,"[17, 103]",3,91,189,337,44,"[0.00019638737, 0.0022907455, 0.01411589, -0.0..."
4,154601,[1235],7,103,276,334,39,"[-0.03563307, -0.02632151, -0.021588087, -0.05..."
...,...,...,...,...,...,...,...,...
102226,118740,[17],3,45,173,430,155,"[0.006317248, -0.031279784, -0.06236591, -0.03..."
102227,61896,[8],0,130,229,405,117,"[0.026928047, -0.0065342286, 0.00997134, 0.010..."
102228,109610,"[1235, 107]",7,97,202,366,84,"[0.13790783, -0.0047743022, 0.026542058, 0.143..."
102229,128257,[15],8,47,190,431,20,"[0.017631581, 0.011740317, 0.04893403, -0.0273..."


In [118]:
def _bytes_feature(value):
  ### Returns a bytes_list from a string / byte."""
    if isinstance(value, type(tf.constant(0))): # if value ist tensor
        value = value.numpy() # get value of tensor
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))

def _float_feature(value):
  ### Returns a floast_list from a float / double."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))

def _int64List_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

def _int64_feature(value):
  ###  Returns an int64_list from a bool / enum / int / uint."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def serialize_array(array):
    array = tf.io.serialize_tensor(array)
    return array

In [119]:
def parse_single_music(data,labels):
    track_id, _, cat1, cat2, cat3, cat4, cat5, music = data
    
    
    label1 = np.array([cat1, labels['label1_count']], np.int64)
    label2 = np.array([cat2, labels['label2_count']], np.int64)
    label3 = np.array([cat3, labels['label3_count']], np.int64)
    label4 = np.array([cat4, labels['label4_count']], np.int64)
    label5 = np.array([cat5, labels['label5_count']], np.int64)
    
    
    
    #define the dictionary -- the structure -- of our single example
    data = {
        'label1': _int64List_feature(label1),
        'label2': _int64List_feature(label2),
        'label3': _int64List_feature(label3),
        'label4': _int64List_feature(label4),
        'label5': _int64List_feature(label5),
        # 'features' : _bytes_feature(serialize_array(music)),
        'features' : _float_feature(music),
        'track_id' : _int64_feature(track_id)
    }
    #create an Example, wrapping the single features
    out = tf.train.Example(features=tf.train.Features(feature=data))

    return out

In [120]:
def generate_tf_record(df,tf_path='val'):
    create_dir(tf_path)
    
    
    batch_size = 1024 * 50 # 50k records from each file batch
    count = 0
    total = math.ceil(len(df) / batch_size)

    for i in range(0, len(df), batch_size):
        batch_df = df[i:i+batch_size]
        
        tfrecords = [parse_single_music(data, labels) for data in batch_df.values]
        
        path = f"{tf_path}/{str(count).zfill(10)}.tfrecord"
        
        #with tf.python_io.TFRecordWriter(path) as writer:
        with tf.io.TFRecordWriter(path) as writer:
            for tfrecord in tfrecords:
                writer.write(tfrecord.SerializeToString())

        print(f"{count} {len(tfrecords)} {path}")
        count += 1
        print(f"{count}/{total} batchs / {count * batch_size} processed")

    print(f"{count}/{total} batchs / {len(df)} processed")
    
    return tf_path

    

In [121]:
tfrecord_path

'/mnt/disks/data/fma/trains/hierarchical_all/tfrecords'

In [122]:
val_path = generate_tf_record(df_val,tf_path=os.path.join(tfrecord_path,'val'))

0 1114 /mnt/disks/data/fma/trains/hierarchical_all/tfrecords/val/0000000000.tfrecord
1/1 batchs / 51200 processed
1/1 batchs / 1114 processed


In [123]:
test_path = generate_tf_record(df_test,tf_path=os.path.join(tfrecord_path,'test'))

0 1125 /mnt/disks/data/fma/trains/hierarchical_all/tfrecords/test/0000000000.tfrecord
1/1 batchs / 51200 processed
1/1 batchs / 1125 processed


In [124]:
train_path = generate_tf_record(df_train,tf_path=os.path.join(tfrecord_path,'train'))

0 51200 /mnt/disks/data/fma/trains/hierarchical_all/tfrecords/train/0000000000.tfrecord
1/2 batchs / 51200 processed
1 51031 /mnt/disks/data/fma/trains/hierarchical_all/tfrecords/train/0000000001.tfrecord
2/2 batchs / 102400 processed
2/2 batchs / 102231 processed


In [125]:
def create_metadata(metadata_path):

    with open(metadata_path, 'w+') as f:
        f.write(json.dumps({
            'sequence_size': args.sequence_size,
            'n_levels': labels_size,
            'labels_size': [labels['label1_count'],labels['label2_count'],
                           labels['label3_count'],labels['label4_count'],
                           labels['label5_count']],
            'val_path': val_path,
            'train_path': train_path,
            'test_path': test_path,
            'trainset_count': len(df_train),
            'validationset_count': len(df_val),
            'testset_count': len(df_test)
        }))

In [126]:
create_metadata(metadata_path)

In [127]:
job_path

'/mnt/disks/data/fma/trains/hierarchical_all'

In [128]:
tracks_df.to_csv(os.path.join(job_path,"tracks.csv"),index=False)

In [129]:
with open(categories_labels_path, 'r') as f:
    labels = json.loads(f.read())

In [130]:
levels_size = {'level1_size': labels['label1_count']-1,
        'level2_size': labels['label2_count']-1,
        'level3_size': labels['label3_count']-1,
        'level4_size': labels['label4_count']-1,
        'level5_size': labels['label5_count']-1}

In [131]:
levels_size['level1_size']

15